SOLR-2792: Allow case insensitive Hunspell stemming

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179459 13f79535-47bb-0310-9956-ffa450edef68
2011-10-05 22:08:55 +00:00 · 2011-10-05 22:08:55 +00:00 · 22dcd39d9e
parent 2c6623a3b3
commit 22dcd39d9e
8 changed files with 165 additions and 32 deletions
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
@ -27,6 +27,7 @@ import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;

 public class HunspellDictionary {

@ -43,11 +44,15 @@ public class HunspellDictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

+  private static final boolean IGNORE_CASE_DEFAULT = false;
+
  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+  private boolean ignoreCase = IGNORE_CASE_DEFAULT;
+
  private final Version version;

  /**
@ -61,7 +66,22 @@ public class HunspellDictionary {
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
-    this(affix, Arrays.asList(dictionary), version);
+    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
+  }
+
+  /**
+   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
+    this(affix, Arrays.asList(dictionary), version, ignoreCase);
  }

  /**
@ -71,15 +91,17 @@ public class HunspellDictionary {
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionaries InputStreams for reading the hunspell dictionary file
   * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
-  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
    this.version = version;
+    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(affix, decoder);
-    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
+    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
    }
@ -129,8 +151,8 @@ public class HunspellDictionary {
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
-    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
-    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
+    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
    
    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
    String line = null;
@ -308,6 +330,9 @@ public class HunspellDictionary {
        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
        Arrays.sort(wordForm.getFlags());
        entry = line.substring(0, flagSep);
+        if(ignoreCase) {
+          entry = entry.toLowerCase(Locale.ENGLISH);
+        }
      }
      
      List<HunspellWord> entries = words.get(entry);
@ -408,4 +433,8 @@ public class HunspellDictionary {
      return flags;
    }
  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
@ -21,9 +21,14 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Scanner;

 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;

 /**
@ -36,6 +41,7 @@ public class HunspellStemmer {
  
  private final HunspellDictionary dictionary;
  private final StringBuilder segment = new StringBuilder();
+  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);

  /**
   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
@ -79,7 +85,7 @@ public class HunspellStemmer {
   */
  public List<Stem> uniqueStems(char word[], int length) {
    List<Stem> stems = new ArrayList<Stem>();
-    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
+    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
    if (dictionary.lookupWord(word, 0, length) != null) {
      stems.add(new Stem(word, length));
      terms.add(word);
@ -167,6 +173,12 @@ public class HunspellStemmer {
   */
  @SuppressWarnings("unchecked")
  public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
+    if(dictionary.isIgnoreCase()) {
+      for(int i=0;i<strippedWord.length;){
+        i += Character.toChars(
+              Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
+      }
+    }
    segment.setLength(0);
    segment.append(strippedWord, 0, length);
    if (!affix.checkCondition(segment)) {
@ -174,7 +186,7 @@ public class HunspellStemmer {
    }

    List<Stem> stems = new ArrayList<Stem>();
-    
+
    List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
    if (words != null) {
      for (HunspellWord hunspellWord : words) {
@ -294,15 +306,24 @@ public class HunspellStemmer {
   * @throws ParseException Can be thrown while parsing the files
   */
  public static void main(String[] args) throws IOException, ParseException {
-    if (args.length != 2) {
-      System.out.println("usage: HunspellStemmer <affix location> <dic location>");
+    boolean ignoreCase = false;
+    int offset = 0;
+    
+    if (args.length < 2) {
+      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
      System.exit(1);
    }

-    InputStream affixInputStream = new FileInputStream(args[0]);
-    InputStream dicInputStream = new FileInputStream(args[1]);
+    if(args[offset].equals("-i")) {
+      ignoreCase = true;
+      System.out.println("Ignoring case. All stems will be returned lowercased");
+      offset++;
+    }
+    
+    InputStream affixInputStream = new FileInputStream(args[offset++]);
+    InputStream dicInputStream = new FileInputStream(args[offset++]);

-    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40);
+    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);

    affixInputStream.close();
    dicInputStream.close();
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
@ -34,7 +34,7 @@ public class HunspellDictionaryTest {
    InputStream dictStream = getClass().getResourceAsStream("test.dic");

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
-    assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());

--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
@ -34,14 +34,7 @@ public class HunspellStemmerTest {

  @BeforeClass
  public static void beforeClass() throws IOException, ParseException {
-    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
-    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
-    stemmer = new HunspellStemmer(dictionary);
-
-    affixStream.close();
-    dictStream.close();
+    createStemmer(true);
  }

  @Test
@ -73,4 +66,61 @@ public class HunspellStemmerTest {
    assertEquals("ab", stems.get(0).getStemString());
  }

+  @Test
+  public void testStem_ignoreCase() throws IOException, ParseException {
+    List<HunspellStemmer.Stem> stems;
+    createStemmer(true);
+
+    stems = stemmer.stem("apache");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+
+    stems = stemmer.stem("APACHE");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+
+    stems = stemmer.stem("Apache");
+    assertEquals(1, stems.size());
+    assertEquals("apach", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("foos");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("food");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("Foos");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+    
+    stems = stemmer.stem("Food");
+    assertEquals(1, stems.size());
+    assertEquals("foo", stems.get(0).getStemString());
+  }
+
+  @Test
+  public void testStem_caseSensitive() throws IOException, ParseException {
+    createStemmer(false);
+    List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
+    assertEquals(0, stems.size());
+
+    stems = stemmer.stem("Apache");
+    assertEquals(1, stems.size());
+    assertEquals("Apach", stems.get(0).getStemString());
+  }
+
+  
+  private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
+    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
+    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
+
+    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40, ignoreCase);
+    stemmer = new HunspellStemmer(dictionary);
+
+    affixStream.close();
+    dictStream.close();
+  }
+
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
@ -1,13 +1,20 @@
 SET UTF-8
 TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

-SFX A Y 2
+SFX A Y 3
 SFX A   0     e         n
 SFX A   0     e         t
+SFX A   0     e         h

 SFX C Y 2
 SFX C   0     d/C       c
 SFX C   0     c         b

+SFX D Y 1
+SFX D   0     s         o
+
+SFX E Y 1
+SFX E   0     d         o
+
 PFX B Y 1
 PFX B   0     s         o
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
@ -1,6 +1,9 @@
-5
+6
 lucen/A
 lucene
 mahout/A
 olr/B
-ab/C
+ab/C
+Apach/A
+foo/D
+Foo/E
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -408,6 +408,8 @@ Bug Fixes

 * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)

+* SOLR-2792: Allow case insensitive Hunspell stemming (janhoy, rmuir)
+
 ==================  3.4.0  ==================

 Upgrading from Solr 3.3
--- a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
@ -25,21 +25,36 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.util.plugin.ResourceLoaderAware;

 /**
 * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary:
+ * Example config for British English including a custom dictionary, case insensitive matching:
 * <pre class="prettyprint" >
 * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
 *    dictionary=&quot;en_GB.dic,my_custom.dic&quot;
- *    affix=&quot;en_GB.aff&quot;/&gt;</pre>
- * Dictionaries for many languages are available through the OpenOffice project
- * <p>See: <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice Dictionaries</a>
+ *    affix=&quot;en_GB.aff&quot;
+ *    ignoreCase=&quot;true&quot; /&gt;</pre>
+ * Both parameters dictionary and affix are mandatory.
+ * <br/>
+ * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
+ * <br/> 
+ * Dictionaries for many languages are available through the OpenOffice project.
+ * 
+ * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
 */
 public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  
+  private static final String PARAM_DICTIONARY = "dictionary";
+  private static final String PARAM_AFFIX = "affix";
+  private static final String PARAM_IGNORE_CASE = "ignoreCase";
+  private static final String TRUE = "true";
+  private static final String FALSE = "false";
+  
  private HunspellDictionary dictionary;
+  private boolean ignoreCase = false;

  /**
   * Loads the hunspell dictionary and affix files defined in the configuration
@ -48,15 +63,21 @@ public class HunspellStemFilterFactory extends BaseTokenFilterFactory implements
   */
  public void inform(ResourceLoader loader) {
    assureMatchVersion();
-    String dictionaryFiles[] = args.get("dictionary").split(",");
-    String affixFile = args.get("affix");
+    String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(",");
+    String affixFile = args.get(PARAM_AFFIX);
+    String pic = args.get(PARAM_IGNORE_CASE);
+    if(pic != null) {
+      if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true;
+      else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false;
+      else throw new SolrException(ErrorCode.UNKNOWN, "Unknown value for "+PARAM_IGNORE_CASE+": "+pic+". Must be true or false");
+    }

    try {
      List<InputStream> dictionaries = new ArrayList<InputStream>();
      for (String file : dictionaryFiles) {
        dictionaries.add(loader.openResource(file));
      }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
    } catch (Exception e) {
      throw new RuntimeException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
    }