SOLR-2792: Allow case insensitive Hunspell stemming

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179459 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2011-10-05 22:08:55 +00:00
parent 2c6623a3b3
commit 22dcd39d9e
8 changed files with 165 additions and 32 deletions

View File

@ -27,6 +27,7 @@ import java.text.ParseException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Locale;
public class HunspellDictionary { public class HunspellDictionary {
@ -43,11 +44,15 @@ public class HunspellDictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
private static final boolean IGNORE_CASE_DEFAULT = false;
private CharArrayMap<List<HunspellWord>> words; private CharArrayMap<List<HunspellWord>> words;
private CharArrayMap<List<HunspellAffix>> prefixes; private CharArrayMap<List<HunspellAffix>> prefixes;
private CharArrayMap<List<HunspellAffix>> suffixes; private CharArrayMap<List<HunspellAffix>> suffixes;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
private boolean ignoreCase = IGNORE_CASE_DEFAULT;
private final Version version; private final Version version;
/** /**
@ -61,7 +66,22 @@ public class HunspellDictionary {
* @throws ParseException Can be thrown if the content of the files does not meet expected formats * @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/ */
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException { public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
this(affix, Arrays.asList(dictionary), version); this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
}
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
*
* @param affix InputStream for reading the hunspell affix file
* @param dictionary InputStream for reading the hunspell dictionary file
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
this(affix, Arrays.asList(dictionary), version, ignoreCase);
} }
/** /**
@ -71,15 +91,17 @@ public class HunspellDictionary {
* @param affix InputStream for reading the hunspell affix file * @param affix InputStream for reading the hunspell affix file
* @param dictionaries InputStreams for reading the hunspell dictionary file * @param dictionaries InputStreams for reading the hunspell dictionary file
* @param version Lucene Version * @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams * @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats * @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/ */
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException { public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
this.version = version; this.version = version;
this.ignoreCase = ignoreCase;
String encoding = getDictionaryEncoding(affix); String encoding = getDictionaryEncoding(affix);
CharsetDecoder decoder = getJavaEncoding(encoding); CharsetDecoder decoder = getJavaEncoding(encoding);
readAffixFile(affix, decoder); readAffixFile(affix, decoder);
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false); words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
for (InputStream dictionary : dictionaries) { for (InputStream dictionary : dictionaries) {
readDictionaryFile(dictionary, decoder); readDictionaryFile(dictionary, decoder);
} }
@ -129,8 +151,8 @@ public class HunspellDictionary {
* @throws IOException Can be thrown while reading from the InputStream * @throws IOException Can be thrown while reading from the InputStream
*/ */
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException { private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false); prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false); suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder)); BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
String line = null; String line = null;
@ -308,6 +330,9 @@ public class HunspellDictionary {
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end))); wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
Arrays.sort(wordForm.getFlags()); Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep); entry = line.substring(0, flagSep);
if(ignoreCase) {
entry = entry.toLowerCase(Locale.ENGLISH);
}
} }
List<HunspellWord> entries = words.get(entry); List<HunspellWord> entries = words.get(entry);
@ -408,4 +433,8 @@ public class HunspellDictionary {
return flags; return flags;
} }
} }
/**
 * Reports the case-handling mode this dictionary was constructed with.
 *
 * @return the current value of the {@code ignoreCase} field
 */
public boolean isIgnoreCase() {
  return this.ignoreCase;
}
} }

View File

@ -21,9 +21,14 @@ import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.text.ParseException; import java.text.ParseException;
import java.util.*; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
/** /**
@ -36,6 +41,7 @@ public class HunspellStemmer {
private final HunspellDictionary dictionary; private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder(); private final StringBuilder segment = new StringBuilder();
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
/** /**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
@ -79,7 +85,7 @@ public class HunspellStemmer {
*/ */
public List<Stem> uniqueStems(char word[], int length) { public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>(); List<Stem> stems = new ArrayList<Stem>();
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false); CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
if (dictionary.lookupWord(word, 0, length) != null) { if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length)); stems.add(new Stem(word, length));
terms.add(word); terms.add(word);
@ -167,6 +173,12 @@ public class HunspellStemmer {
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) { public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
if(dictionary.isIgnoreCase()) {
for(int i=0;i<strippedWord.length;){
i += Character.toChars(
Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
}
}
segment.setLength(0); segment.setLength(0);
segment.append(strippedWord, 0, length); segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) { if (!affix.checkCondition(segment)) {
@ -294,15 +306,24 @@ public class HunspellStemmer {
* @throws ParseException Can be thrown while parsing the files * @throws ParseException Can be thrown while parsing the files
*/ */
public static void main(String[] args) throws IOException, ParseException { public static void main(String[] args) throws IOException, ParseException {
if (args.length != 2) { boolean ignoreCase = false;
System.out.println("usage: HunspellStemmer <affix location> <dic location>"); int offset = 0;
if (args.length < 2) {
System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
System.exit(1); System.exit(1);
} }
InputStream affixInputStream = new FileInputStream(args[0]); if(args[offset].equals("-i")) {
InputStream dicInputStream = new FileInputStream(args[1]); ignoreCase = true;
System.out.println("Ignoring case. All stems will be returned lowercased");
offset++;
}
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40); InputStream affixInputStream = new FileInputStream(args[offset++]);
InputStream dicInputStream = new FileInputStream(args[offset++]);
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);
affixInputStream.close(); affixInputStream.close();
dicInputStream.close(); dicInputStream.close();

View File

@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
InputStream dictStream = getClass().getResourceAsStream("test.dic"); InputStream dictStream = getClass().getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40); HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());

View File

@ -34,14 +34,7 @@ public class HunspellStemmerTest {
@BeforeClass @BeforeClass
public static void beforeClass() throws IOException, ParseException { public static void beforeClass() throws IOException, ParseException {
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); createStemmer(true);
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
stemmer = new HunspellStemmer(dictionary);
affixStream.close();
dictStream.close();
} }
@Test @Test
@ -73,4 +66,61 @@ public class HunspellStemmerTest {
assertEquals("ab", stems.get(0).getStemString()); assertEquals("ab", stems.get(0).getStemString());
} }
/**
 * Rebuilds the shared stemmer with ignoreCase enabled and checks that every
 * listed input word yields exactly one stem with the expected lowercased text.
 */
@Test
public void testStem_ignoreCase() throws IOException, ParseException {
  createStemmer(true);

  // Each row is {input word, expected stem}; rows are checked in this order.
  final String[][] expectations = {
      {"apache", "apach"},
      {"APACHE", "apach"},
      {"Apache", "apach"},
      {"foos", "foo"},
      {"food", "foo"},
      {"Foos", "foo"},
      {"Food", "foo"},
  };

  for (String[] expectation : expectations) {
    List<HunspellStemmer.Stem> result = stemmer.stem(expectation[0]);
    assertEquals(1, result.size());
    assertEquals(expectation[1], result.get(0).getStemString());
  }
}
/**
 * Rebuilds the shared stemmer with ignoreCase disabled and checks that the
 * lowercase input produces no stems while the capitalized input produces
 * the single stem "Apach".
 */
@Test
public void testStem_caseSensitive() throws IOException, ParseException {
  createStemmer(false);

  List<HunspellStemmer.Stem> lowercaseResult = stemmer.stem("apache");
  assertEquals(0, lowercaseResult.size());

  List<HunspellStemmer.Stem> capitalizedResult = stemmer.stem("Apache");
  assertEquals(1, capitalizedResult.size());
  assertEquals("Apach", capitalizedResult.get(0).getStemString());
}
/**
 * Replaces the shared {@code stemmer} with one backed by a dictionary loaded
 * from the {@code test.aff} and {@code test.dic} classpath resources.
 *
 * Both streams are closed in {@code finally} blocks so they are released even
 * when loading the dictionary or constructing the stemmer throws.
 *
 * @param ignoreCase passed through to the HunspellDictionary constructor
 * @throws IOException declared by the dictionary constructor and by closing the streams
 * @throws ParseException declared by the dictionary constructor
 */
private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
  InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
  try {
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
    try {
      HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40, ignoreCase);
      stemmer = new HunspellStemmer(dictionary);
    } finally {
      // getResourceAsStream returns null for a missing resource; guard so the
      // cleanup does not raise a second exception that hides the original failure.
      if (dictStream != null) {
        dictStream.close();
      }
    }
  } finally {
    if (affixStream != null) {
      affixStream.close();
    }
  }
}
} }

View File

@ -1,13 +1,20 @@
SET UTF-8 SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
SFX A Y 2 SFX A Y 3
SFX A 0 e n SFX A 0 e n
SFX A 0 e t SFX A 0 e t
SFX A 0 e h
SFX C Y 2 SFX C Y 2
SFX C 0 d/C c SFX C 0 d/C c
SFX C 0 c b SFX C 0 c b
SFX D Y 1
SFX D 0 s o
SFX E Y 1
SFX E 0 d o
PFX B Y 1 PFX B Y 1
PFX B 0 s o PFX B 0 s o

View File

@ -1,6 +1,9 @@
5 6
lucen/A lucen/A
lucene lucene
mahout/A mahout/A
olr/B olr/B
ab/C ab/C
Apach/A
foo/D
Foo/E

View File

@ -408,6 +408,8 @@ Bug Fixes
* SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy) * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
* SOLR-2792: Allow case insensitive Hunspell stemming (janhoy, rmuir)
================== 3.4.0 ================== ================== 3.4.0 ==================
Upgrading from Solr 3.3 Upgrading from Solr 3.3

View File

@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.HunspellDictionary; import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter; import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.util.plugin.ResourceLoaderAware;
/** /**
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}. * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
* Example config for British English including a custom dictionary: * Example config for British English including a custom dictionary, case insensitive matching:
* <pre class="prettyprint" > * <pre class="prettyprint" >
* &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot; * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
* dictionary=&quot;en_GB.dic,my_custom.dic&quot; * dictionary=&quot;en_GB.dic,my_custom.dic&quot;
* affix=&quot;en_GB.aff&quot;/&gt;</pre> * affix=&quot;en_GB.aff&quot;
* Dictionaries for many languages are available through the OpenOffice project * ignoreCase=&quot;true&quot; /&gt;</pre>
* <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a> * Both parameters dictionary and affix are mandatory.
* <br/>
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
* <br/>
* Dictionaries for many languages are available through the OpenOffice project.
*
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
*/ */
public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_IGNORE_CASE = "ignoreCase";
private static final String TRUE = "true";
private static final String FALSE = "false";
private HunspellDictionary dictionary; private HunspellDictionary dictionary;
private boolean ignoreCase = false;
/** /**
* Loads the hunspell dictionary and affix files defined in the configuration * Loads the hunspell dictionary and affix files defined in the configuration
@ -48,15 +63,21 @@ public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements
*/ */
public void inform(ResourceLoader loader) { public void inform(ResourceLoader loader) {
assureMatchVersion(); assureMatchVersion();
String dictionaryFiles[] = args.get("dictionary").split(","); String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
String affixFile = args.get("affix"); String affixFile = args.get(PARAM_AFFIX);
String pic = args.get(PARAM_IGNORE_CASE);
if(pic != null) {
if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
}
try { try {
List<InputStream> dictionaries = new ArrayList<InputStream>(); List<InputStream> dictionaries = new ArrayList<InputStream>();
for (String file : dictionaryFiles) { for (String file : dictionaryFiles) {
dictionaries.add(loader.openResource(file)); dictionaries.add(loader.openResource(file));
} }
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion); this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e); throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
} }