SOLR-2792: Allow case insensitive Hunspell stemming

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179459 13f79535-47bb-0310-9956-ffa450edef68
2011-10-05 22:08:55 +00:00 · 2011-10-05 22:08:55 +00:00 · 22dcd39d9e
parent 2c6623a3b3
commit 22dcd39d9e
8 changed files with 165 additions and 32 deletions
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
@ -27,6 +27,7 @@ import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 public class HunspellDictionary {
@ -43,11 +44,15 @@ public class HunspellDictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
  private static final boolean IGNORE_CASE_DEFAULT = false;
  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;
  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private boolean ignoreCase = IGNORE_CASE_DEFAULT;
  private final Version version;
  /**
@ -61,7 +66,22 @@ public class HunspellDictionary {
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
-    this(affix, Arrays.asList(dictionary), version);
+    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
  }
  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionary InputStream for reading the hunspell dictionary file
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version, ignoreCase);
  }
  /**
@ -71,15 +91,17 @@ public class HunspellDictionary {
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionaries InputStreams for reading the hunspell dictionary file
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
-  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
    this.version = version;
    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(affix, decoder);
-    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
+    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
    }
@ -129,8 +151,8 @@ public class HunspellDictionary {
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
-    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
    String line = null;
@ -308,6 +330,9 @@ public class HunspellDictionary {
        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
        Arrays.sort(wordForm.getFlags());
        entry = line.substring(0, flagSep);
        if(ignoreCase) {
          entry = entry.toLowerCase(Locale.ENGLISH);
        }
      }
      List<HunspellWord> entries = words.get(entry);
@ -408,4 +433,8 @@ public class HunspellDictionary {
      return flags;
    }
  }
  public boolean isIgnoreCase() {
    return ignoreCase;
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
@ -21,9 +21,14 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
-import java.util.*;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Scanner;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 /**
@ -36,6 +41,7 @@ public class HunspellStemmer {
  private final HunspellDictionary dictionary;
  private final StringBuilder segment = new StringBuilder();
  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
  /**
   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
@ -79,7 +85,7 @@ public class HunspellStemmer {
   */
  public List<Stem> uniqueStems(char word[], int length) {
    List<Stem> stems = new ArrayList<Stem>();
-    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
+    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
    if (dictionary.lookupWord(word, 0, length) != null) {
      stems.add(new Stem(word, length));
      terms.add(word);
@ -167,6 +173,12 @@ public class HunspellStemmer {
   */
  @SuppressWarnings("unchecked")
  public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
    if(dictionary.isIgnoreCase()) {
      for(int i=0;i<strippedWord.length;){
        i += Character.toChars(
              Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
      }
    }
    segment.setLength(0);
    segment.append(strippedWord, 0, length);
    if (!affix.checkCondition(segment)) {
@ -294,15 +306,24 @@ public class HunspellStemmer {
   * @throws ParseException Can be thrown while parsing the files
   */
  public static void main(String[] args) throws IOException, ParseException {
-    if (args.length != 2) {
+    boolean ignoreCase = false;
-      System.out.println("usage: HunspellStemmer <affix location> <dic location>");
+    int offset = 0;
    if (args.length < 2) {
      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
      System.exit(1);
    }
-    InputStream affixInputStream = new FileInputStream(args[0]);
+    if(args[offset].equals("-i")) {
-    InputStream dicInputStream = new FileInputStream(args[1]);
+      ignoreCase = true;
      System.out.println("Ignoring case. All stems will be returned lowercased");
      offset++;
    }
-    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40);
+    InputStream affixInputStream = new FileInputStream(args[offset++]);
    InputStream dicInputStream = new FileInputStream(args[offset++]);
    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);
    affixInputStream.close();
    dicInputStream.close();
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
    InputStream dictStream = getClass().getResourceAsStream("test.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
-    assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
@ -34,14 +34,7 @@ public class HunspellStemmerTest {
  @BeforeClass
  public static void beforeClass() throws IOException, ParseException {
-    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
+    createStemmer(true);
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
    stemmer = new HunspellStemmer(dictionary);
    affixStream.close();
    dictStream.close();
  }
  @Test
@ -73,4 +66,61 @@ public class HunspellStemmerTest {
    assertEquals("ab", stems.get(0).getStemString());
  }
  @Test
  public void testStem_ignoreCase() throws IOException, ParseException {
    List<HunspellStemmer.Stem> stems;
    createStemmer(true);
    stems = stemmer.stem("apache");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());
    stems = stemmer.stem("APACHE");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());
    stems = stemmer.stem("Apache");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());
    stems = stemmer.stem("foos");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());
    stems = stemmer.stem("food");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());
    stems = stemmer.stem("Foos");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());
    stems = stemmer.stem("Food");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());
  }
  @Test
  public void testStem_caseSensitive() throws IOException, ParseException {
    createStemmer(false);
    List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
    assertEquals(0, stems.size());
    stems = stemmer.stem("Apache");
    assertEquals(1, stems.size());
    assertEquals("Apach", stems.get(0).getStemString());
  }
  private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40, ignoreCase);
    stemmer = new HunspellStemmer(dictionary);
    affixStream.close();
    dictStream.close();
  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
@ -1,13 +1,20 @@
 SET UTF-8
 TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-SFX A Y 2
+SFX A Y 3
 SFX A   0     e         n
 SFX A   0     e         t
 SFX A   0     e         h
 SFX C Y 2
 SFX C   0     d/C       c
 SFX C   0     c         b
 SFX D Y 1
 SFX D   0     s         o
 SFX E Y 1
 SFX E   0     d         o
 PFX B Y 1
 PFX B   0     s         o
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
@ -1,6 +1,9 @@
-5
+6
 lucen/A
 lucene
 mahout/A
 olr/B
 ab/C
 Apach/A
 foo/D
 Foo/E
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -408,6 +408,8 @@ Bug Fixes
 * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
 * SOLR-2792: Allow case insensitive Hunspell stemming (janhoy, rmuir)
 ==================  3.4.0  ==================
 Upgrading from Solr 3.3
--- a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 /**
 * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary:
+ * Example config for British English including a custom dictionary, case insensitive matching:
 * <pre class="prettyprint" >
 * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
 *    dictionary=&quot;en_GB.dic,my_custom.dic&quot;
- *    affix=&quot;en_GB.aff&quot;/&gt;</pre>
+ *    affix=&quot;en_GB.aff&quot;
- * Dictionaries for many languages are available through the OpenOffice project
+ *    ignoreCase=&quot;true&quot; /&gt;</pre>
- * <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a>
+ * Both parameters dictionary and affix are mandatory.
 * <br/>
 * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
 * <br/> 
 * Dictionaries for many languages are available through the OpenOffice project.
 * 
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
 */
 public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_IGNORE_CASE = "ignoreCase";
  private static final String TRUE = "true";
  private static final String FALSE = "false";
  private HunspellDictionary dictionary;
  private boolean ignoreCase = false;
  /**
   * Loads the hunspell dictionary and affix files defined in the configuration
@ -48,15 +63,21 @@ public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements
   */
  public void inform(ResourceLoader loader) {
    assureMatchVersion();
-    String dictionaryFiles[] = args.get("dictionary").split(",");
+    String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
-    String affixFile = args.get("affix");
+    String affixFile = args.get(PARAM_AFFIX);
    String pic = args.get(PARAM_IGNORE_CASE);
    if(pic != null) {
      if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
      else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
      else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
    }
    try {
      List<InputStream> dictionaries = new ArrayList<InputStream>();
      for (String file : dictionaryFiles) {
        dictionaries.add(loader.openResource(file));
      }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
    } catch (Exception e) {
      throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
    }