LUCENE-3414: Added Hunspell for Lucene

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1167467 13f79535-47bb-0310-9956-ffa450edef68
2011-09-10 06:00:39 +00:00 · 2011-09-10 06:00:39 +00:00 · e3172b9239
parent 397d68e080
commit e3172b9239
11 changed files with 1280 additions and 0 deletions
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@ -98,6 +98,9 @@ New Features
 * SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
   /something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)

+ * LUCENE-3414: Added HunspellStemFilter which uses a provided pure Java implementation of the 
+   Hunspell algorithm. (Chris Male)
+
 Build

 * LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the 
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
@ -0,0 +1,157 @@
+package org.apache.lucene.analysis.hunspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.regex.Pattern;
+
+/**
+ * Mutable holder for the parts of a single hunspell affix rule: the append, its continuation flags, the stripping
+ * characters, the condition (raw and compiled), the affix flag and the cross product marker.
+ */
+public class HunspellAffix {
+
+  private String append; // the affix itself, what is appended
+  private char[] appendFlags; // continuation class flags
+  private String strip;
+
+  private String condition;
+  private Pattern conditionPattern; // compiled by setCondition, used by checkCondition
+
+  private char flag;
+
+  private boolean crossProduct;
+
+  /**
+   * Tests the whole of the given text against the regular expression supplied to
+   * {@link #setCondition(String, String)}
+   *
+   * @param text Text to test against the affix's condition pattern
+   * @return {@code true} if the entire text matches the pattern, {@code false} otherwise
+   */
+  public boolean checkCondition(CharSequence text) {
+    final boolean conditionMet = conditionPattern.matcher(text).matches();
+    return conditionMet;
+  }
+
+  /**
+   * Gets the append of this affix
+   *
+   * @return Append of this affix
+   */
+  public String getAppend() {
+    return this.append;
+  }
+
+  /**
+   * Stores the append of this affix
+   *
+   * @param append Append to store
+   */
+  public void setAppend(String append) {
+    this.append = append;
+  }
+
+  /**
+   * Gets the flags attached to the append of this affix.  The stored array itself is returned, not a copy.
+   *
+   * @return Flags attached to the append
+   */
+  public char[] getAppendFlags() {
+    return this.appendFlags;
+  }
+
+  /**
+   * Stores the flags attached to the append of this affix.  The array is kept as is, not copied.
+   *
+   * @param appendFlags Flags attached to the append
+   */
+  public void setAppendFlags(char[] appendFlags) {
+    this.appendFlags = appendFlags;
+  }
+
+  /**
+   * Gets the stripping characters of this affix
+   *
+   * @return Stripping characters of this affix
+   */
+  public String getStrip() {
+    return this.strip;
+  }
+
+  /**
+   * Stores the stripping characters of this affix
+   *
+   * @param strip Stripping characters to store
+   */
+  public void setStrip(String strip) {
+    this.strip = strip;
+  }
+
+  /**
+   * Gets the condition of this affix in the form it was handed to {@link #setCondition(String, String)}
+   *
+   * @return Condition that must be met before the affix can be applied
+   */
+  public String getCondition() {
+    return this.condition;
+  }
+
+  /**
+   * Stores the condition of this affix and compiles its regular expression form, which is what
+   * {@link #checkCondition(CharSequence)} evaluates
+   *
+   * @param condition Condition to be met before affix application
+   * @param pattern The same condition expressed as a regular expression
+   */
+  public void setCondition(String condition, String pattern) {
+    this.conditionPattern = Pattern.compile(pattern);
+    this.condition = condition;
+  }
+
+  /**
+   * Gets the flag of this affix
+   *
+   * @return Flag of this affix
+   */
+  public char getFlag() {
+    return this.flag;
+  }
+
+  /**
+   * Stores the flag of this affix
+   *
+   * @param flag Flag to store
+   */
+  public void setFlag(char flag) {
+    this.flag = flag;
+  }
+
+  /**
+   * Tells whether this affix is marked as cross product
+   *
+   * @return {@code true} if the affix is cross product, {@code false} otherwise
+   */
+  public boolean isCrossProduct() {
+    return this.crossProduct;
+  }
+
+  /**
+   * Marks this affix as cross product or not
+   *
+   * @param crossProduct Whether the affix is cross product
+   */
+  public void setCrossProduct(boolean crossProduct) {
+    this.crossProduct = crossProduct;
+  }
+}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
@ -0,0 +1,411 @@
+package org.apache.lucene.analysis.hunspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.util.Version;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class HunspellDictionary {
+
+  static final HunspellWord NOFLAGS = new HunspellWord();
+  
+  private static final String PREFIX_KEY = "PFX";
+  private static final String SUFFIX_KEY = "SFX";
+  private static final String FLAG_KEY = "FLAG";
+
+  private static final String NUM_FLAG_TYPE = "num";
+  private static final String UTF8_FLAG_TYPE = "UTF-8";
+  private static final String LONG_FLAG_TYPE = "long";
+  
+  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
+  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+
+  private CharArrayMap<List<HunspellWord>> words;
+  private CharArrayMap<List<HunspellAffix>> prefixes;
+  private CharArrayMap<List<HunspellAffix>> suffixes;
+
+  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+  private final Version version;
+
+  /**
+   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files.  This is the single-dictionary convenience form: the dictionary stream is wrapped in a
+   * one-element list and handed to {@link #HunspellDictionary(InputStream, List, Version)}.
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionary InputStream for reading the hunspell dictionary file
+   * @param version Lucene Version
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
+    this(affix, Arrays.asList(dictionary), version);
+  }
+
+  /**
+   * Builds a HunspellDictionary from one affix file and any number of dictionary files.  The encoding declared at the
+   * start of the affix file is used to decode the rest of the affix file as well as every dictionary file.
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionaries InputStreams for reading the hunspell dictionary files
+   * @param version Lucene Version
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
+    // must be assigned first, readAffixFile creates its maps with this version
+    this.version = version;
+
+    // the encoding declaration is read from the raw stream, the remainder of that stream is then decoded with it
+    CharsetDecoder charsetDecoder = getJavaEncoding(getDictionaryEncoding(affix));
+    readAffixFile(affix, charsetDecoder);
+
+    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
+    for (InputStream dictionaryStream : dictionaries) {
+      readDictionaryFile(dictionaryStream, charsetDecoder);
+    }
+  }
+
+  /**
+   * Looks up HunspellWords that match the String created from the given char array, offset and length.  The lookup
+   * is answered from the map of words filled while reading the dictionary files.
+   *
+   * @param word Char array to generate the String from
+   * @param offset Offset in the char array that the String starts at
+   * @param length Length from the offset that the String is
+   * @return List of HunspellWords that match the generated String, or {@code null} if none are found
+   */
+  public List<HunspellWord> lookupWord(char word[], int offset, int length) {
+    return words.get(word, offset, length);
+  }
+
+  /**
+   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array,
+   * offset and length.  The lookup is answered from the prefix map filled while reading the affix file, which is keyed
+   * by the append of each rule.
+   *
+   * @param word Char array to generate the String from
+   * @param offset Offset in the char array that the String starts at
+   * @param length Length from the offset that the String is
+   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
+   */
+  public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
+    return prefixes.get(word, offset, length);
+  }
+
+  /**
+   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array,
+   * offset and length.  The lookup is answered from the suffix map filled while reading the affix file, which is keyed
+   * by the append of each rule.
+   *
+   * @param word Char array to generate the String from
+   * @param offset Offset in the char array that the String starts at
+   * @param length Length from the offset that the String is
+   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
+   */
+  public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
+    return suffixes.get(word, offset, length);
+  }
+
+  /**
+   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps.  Lines starting
+   * with {@code PFX} or {@code SFX} are parsed as affix rules, a line starting with {@code FLAG} selects the flag
+   * parsing strategy, every other line is ignored.  The stream is closed when this method returns, whether reading
+   * succeeded or failed.
+   *
+   * @param affixStream InputStream to read the content of the affix file from
+   * @param decoder CharsetDecoder to decode the content of the file
+   * @throws IOException Can be thrown while reading from the InputStream
+   */
+  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
+    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
+
+    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
+    try {
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        if (line.startsWith(PREFIX_KEY)) {
+          parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        } else if (line.startsWith(SUFFIX_KEY)) {
+          parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        } else if (line.startsWith(FLAG_KEY)) {
+          // Assume that the FLAG line comes before any prefix or suffixes
+          // Store the strategy so it can be used when parsing the dic file
+          flagParsingStrategy = getFlagParsingStrategy(line);
+        }
+      }
+    } finally {
+      // release the stream even if a rule could not be parsed
+      reader.close();
+    }
+  }
+
+  /**
+   * Parses a specific affix rule putting the result into the provided affix map.  The header line supplies the cross
+   * product marker and the number of rule lines that follow; each of those lines is read from the reader and turned
+   * into one {@link HunspellAffix}, stored in the map under its append.
+   * 
+   * @param affixes Map where the result of the parsing will be put
+   * @param header Header line of the affix rule
+   * @param reader BufferedReader to read the content of the rule from
+   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
+   *                         pattern
+   * @throws IOException Can be thrown while reading the rule; an {@link EOFException} is thrown if the file ends
+   *                     before all rule lines announced by the header have been read
+   */
+  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
+                          String header,
+                          BufferedReader reader,
+                          String conditionPattern) throws IOException {
+    String args[] = header.split("\\s+");
+
+    boolean crossProduct = args[2].equals("Y");
+
+    int numLines = Integer.parseInt(args[3]);
+    for (int i = 0; i < numLines; i++) {
+      String line = reader.readLine();
+      if (line == null) {
+        // the header announced more rule lines than the file contains
+        throw new EOFException("Affix file ended after " + i + " of the " + numLines + " rules announced by: " + header);
+      }
+      String ruleArgs[] = line.split("\\s+");
+
+      HunspellAffix affix = new HunspellAffix();
+
+      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
+      // "0" in the strip column means that nothing is stripped
+      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
+
+      String affixArg = ruleArgs[3];
+
+      // the append may carry continuation flags after a '/'
+      int flagSep = affixArg.lastIndexOf('/');
+      if (flagSep != -1) {
+        char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
+        Arrays.sort(appendFlags);
+        affix.setAppendFlags(appendFlags);
+        affix.setAppend(affixArg.substring(0, flagSep));
+      } else {
+        affix.setAppend(affixArg);
+      }
+
+      // the condition column may be left out; "." is the notation for "no condition" and matches any character
+      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
+      affix.setCondition(condition, String.format(conditionPattern, condition));
+      affix.setCrossProduct(crossProduct);
+
+      List<HunspellAffix> list = affixes.get(affix.getAppend());
+      if (list == null) {
+        list = new ArrayList<HunspellAffix>();
+        affixes.put(affix.getAppend(), list);
+      }
+
+      list.add(affix);
+    }
+  }
+
+  /**
+   * Parses the encoding specified in the affix file readable through the provided InputStream.  The stream is read
+   * byte by byte, line by line; empty lines, lines consisting only of whitespace and lines starting with {@code #} are
+   * skipped.  Only the bytes up to and including the end of the {@code SET} line are consumed.
+   *
+   * @param affix InputStream for reading the affix file
+   * @return Encoding specified in the affix file
+   * @throws IOException Can be thrown while reading from the InputStream
+   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
+   */
+  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
+    final StringBuilder encoding = new StringBuilder();
+    for (;;) {
+      encoding.setLength(0);
+      int ch;
+      // collect one line; '\r' is dropped so that both "\n" and "\r\n" line ends work
+      while ((ch = affix.read()) >= 0) {
+        if (ch == '\n') {
+          break;
+        }
+        if (ch != '\r') {
+          encoding.append((char)ch);
+        }
+      }
+      if (
+          encoding.length() == 0 || encoding.charAt(0) == '#' ||
+          // this test only at the end as ineffective but would allow lines only containing spaces:
+          encoding.toString().trim().length() == 0
+      ) {
+        if (ch < 0) {
+          throw new ParseException("Unexpected end of affix file.", 0);
+        }
+        continue;
+      }
+      // check the length first: a line shorter than "SET " has to end up in the ParseException below instead of
+      // failing inside substring(0, 4)
+      if (encoding.length() >= 4 && "SET ".equals(encoding.substring(0, 4))) {
+        // cleanup the encoding string, too (whitespace)
+        return encoding.substring(4).trim();
+      }
+      throw new ParseException("The first non-comment line in the affix file must "+
+          "be a 'SET charset', was: '" + encoding +"'", 0);
+    }
+  }
+
+  /**
+   * Creates a new CharsetDecoder for the charset with the given name.  The name is resolved as is through
+   * {@link Charset#forName(String)}; the original author notes that this may not cover every encoding name allowed
+   * in affix files (ISCII-DEVANAGARI and MICROSOFT-CP1251 are given as examples).
+   *
+   * @param encoding Encoding to retrieve the CharsetDecoder for
+   * @return CharsetDecoder for the given encoding
+   */
+  private CharsetDecoder getJavaEncoding(String encoding) {
+    return Charset.forName(encoding).newDecoder();
+  }
+
+  /**
+   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file.
+   * The line must consist of the FLAG keyword and the flag type, separated by whitespace.
+   *
+   * @param flagLine Line containing the flag information
+   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
+   * @throws IllegalArgumentException if the line does not have exactly two whitespace separated parts or names an
+   *                                  unknown flag type
+   */
+  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
+    // split on whitespace rather than cutting at a fixed offset, so that a tab, repeated blanks or trailing
+    // whitespace do not become part of the type and a bare "FLAG" line is reported properly
+    String parts[] = flagLine.trim().split("\\s+");
+    if (parts.length != 2) {
+      throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
+    }
+    String flagType = parts[1];
+
+    if (NUM_FLAG_TYPE.equals(flagType)) {
+      return new NumFlagParsingStrategy();
+    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
+      return new SimpleFlagParsingStrategy();
+    } else if (LONG_FLAG_TYPE.equals(flagType)) {
+      return new DoubleASCIIFlagParsingStrategy();
+    }
+
+    throw new IllegalArgumentException("Unknown flag type: " + flagType);
+  }
+
+  /**
+   * Reads one dictionary file and adds its entries to the words map.  The first line is expected to be the number of
+   * entries; every following line is one entry, optionally followed by {@code /} and its flags.  Several entries with
+   * the same text end up in the same list.
+   *
+   * @param dictionary InputStream to read the dictionary file through
+   * @param decoder CharsetDecoder used to decode the contents of the file
+   * @throws IOException Can be thrown while reading from the file
+   */
+  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
+    BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+    // TODO: don't create millions of strings.
+    // first line is number of entries; the value is not used, but it still has to parse as a number
+    Integer.parseInt(lines.readLine());
+
+    // TODO: the flags themselves can be double-chars (long) or also numeric
+    // either way the trick is to encode them as char... but they must be parsed differently
+    for (String line = lines.readLine(); line != null; line = lines.readLine()) {
+      // without a '/' the whole line is the entry and it carries no flags
+      String entry = line;
+      HunspellWord wordForm = NOFLAGS;
+
+      int slash = line.lastIndexOf('/');
+      if (slash != -1) {
+        // note, there can be comments (morph description) after a flag.
+        // we should really look for any whitespace
+        int tab = line.indexOf('\t', slash);
+        int flagsEnd = tab == -1 ? line.length() : tab;
+
+        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(slash + 1, flagsEnd)));
+        Arrays.sort(wordForm.getFlags());
+        entry = line.substring(0, slash);
+      }
+
+      List<HunspellWord> forms = words.get(entry);
+      if (forms == null) {
+        forms = new ArrayList<HunspellWord>();
+        words.put(entry, forms);
+      }
+      forms.add(wordForm);
+    }
+  }
+
+  /**
+   * Returns the Lucene Version this dictionary was created with
+   *
+   * @return Version that was passed to the constructor
+   */
+  public Version getVersion() {
+    return version;
+  }
+
+  /**
+   * Abstraction of the process of parsing flags taken from the affix and dic files
+   */
+  private static abstract class FlagParsingStrategy {
+
+    /**
+     * Parses the given String into a single flag
+     *
+     * @param rawFlag String to parse into a flag
+     * @return Parsed flag
+     */
+    char parseFlag(String rawFlag) {
+      return parseFlags(rawFlag)[0];
+    }
+
+    /**
+     * Parses the given String into multiple flags
+     *
+     * @param rawFlags String to parse into flags
+     * @return Parsed flags
+     */
+    abstract char[] parseFlags(String rawFlags);
+  }
+
+  /**
+   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
+   * Can be used with both the ASCII and UTF-8 flag types.
+   */
+  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
+    /**
+     * {@inheritDoc}
+     */
+    public char[] parseFlags(String rawFlags) {
+      return rawFlags.toCharArray();
+    }
+  }
+
+  /**
+   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
+   * of multiple flags, each number is separated by a comma.
+   */
+  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
+    /**
+     * {@inheritDoc}
+     */
+    public char[] parseFlags(String rawFlags) {
+      String[] rawFlagParts = rawFlags.trim().split(",");
+      char[] flags = new char[rawFlagParts.length];
+
+      for (int i = 0; i < rawFlagParts.length; i++) {
+        // note, removing the trailing X/leading I for nepali... what is the rule here?! 
+        flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
+      }
+
+      return flags;
+    }
+  }
+
+  /**
+   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
+   * must be combined into a single character.  The first character is placed in the high byte and the second in the
+   * low byte, so different pairs of ASCII characters (for example "ab" and "ba") give different flags.
+   *
+   * TODO (rmuir) test
+   */
+  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
+
+    /**
+     * Parses the given String, two characters at a time, into flags
+     *
+     * @param rawFlags String to parse into flags, its length must be even
+     * @return Parsed flags, one per pair of characters
+     * @throws IllegalArgumentException if the String has an odd number of characters
+     */
+    public char[] parseFlags(String rawFlags) {
+      int length = rawFlags.length();
+      if (length == 0) {
+        return new char[0];
+      }
+      if (length % 2 == 1) {
+        // a dangling character cannot form a flag, fail clearly instead of reading past the end of the String
+        throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
+      }
+
+      char flags[] = new char[length / 2];
+      for (int i = 0; i < flags.length; i++) {
+        char first = rawFlags.charAt(i * 2);
+        char second = rawFlags.charAt(i * 2 + 1);
+        // shift/or instead of adding the codes: a sum maps "ab", "ba" and "ac"/"bb" to the same flag
+        flags[i] = (char) (first << 8 | second);
+      }
+      return flags;
+    }
+  }
+}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.hunspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
+ * stems, this filter can emit multiple tokens for each consumed token
+ */
+public final class HunspellStemFilter extends TokenFilter {
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final HunspellStemmer stemmer;
+  
+  private List<Stem> buffer;
+  private State savedState;
+  
+  private final boolean dedup;
+
+  /**
+   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * HunspellDictionary.  Same as {@link #HunspellStemFilter(TokenStream, HunspellDictionary, boolean)} with
+   * {@code dedup} set to {@code true}.
+   *
+   * @param input TokenStream whose tokens will be stemmed
+   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+   */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+    this(input, dictionary, true);
+  }
+  
+  /**
+   * Creates a new HunspellStemFilter that stems the tokens of the given TokenStream with a {@link HunspellStemmer}
+   * built on the provided HunspellDictionary
+   *
+   * @param input TokenStream whose tokens will be stemmed
+   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+   * @param dedup true if only unique terms should be output.
+   */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+    super(input);
+    this.stemmer = new HunspellStemmer(dictionary);
+    this.dedup = dedup;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Stems still buffered from the last consumed token are emitted first, one per call, with a position increment of 0.
+   * Otherwise the next token is pulled from the input: if the stemmer returns no stem it is passed on unchanged, else
+   * its term is replaced by the first stem and the remaining stems are kept for the following calls.
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    final boolean stemsPending = buffer != null && !buffer.isEmpty();
+    if (stemsPending) {
+      Stem pending = buffer.remove(0);
+      // bring back the attributes captured for the token, then only swap in the term and the position increment
+      restoreState(savedState);
+      posIncAtt.setPositionIncrement(0);
+      termAtt.copyBuffer(pending.getStem(), 0, pending.getStemLength());
+      termAtt.setLength(pending.getStemLength());
+      return true;
+    }
+
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    if (dedup) {
+      buffer = stemmer.uniqueStems(termAtt.buffer(), termAtt.length());
+    } else {
+      buffer = stemmer.stem(termAtt.buffer(), termAtt.length());
+    }
+
+    if (!buffer.isEmpty()) {
+      Stem first = buffer.remove(0);
+      termAtt.copyBuffer(first.getStem(), 0, first.getStemLength());
+      termAtt.setLength(first.getStemLength());
+
+      if (!buffer.isEmpty()) {
+        // more stems to come, remember the token so they can be emitted on top of it
+        savedState = captureState();
+      }
+    }
+    // an empty buffer means we do not know this word, it is returned unchanged
+
+    return true;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Also discards the stems that are still buffered from the last consumed token.
+   */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    // incrementToken only emits buffered stems while the buffer is non-null and non-empty
+    buffer = null;
+  }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
@ -0,0 +1,372 @@
+package org.apache.lucene.analysis.hunspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.*;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
+/**
+ * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word.  It
+ * follows the original hunspell algorithm, including recursive suffix stripping.
+ */
+public class HunspellStemmer {
+
+  // Recursion depth at which applyAffix stops stripping further affixes
+  private static final int RECURSION_CAP = 2;
+  
+  // Source of the words and affix rules looked up while stemming
+  private final HunspellDictionary dictionary;
+  // Scratch buffer reused by applyAffix to hold the stripped word for the affix condition check
+  private final StringBuilder segment = new StringBuilder();
+
+  /**
+   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+   *
+   * @param dictionary HunspellDictionary whose words, prefixes and suffixes are looked up while stemming
+   */
+  public HunspellStemmer(HunspellDictionary dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Finds the stem(s) of the provided word, given as a String
+   *
+   * @param word Word to find the stems for
+   * @return List of stems for the word
+   */
+  public List<Stem> stem(String word) {
+    char[] chars = word.toCharArray();
+    return stem(chars, chars.length);
+  }
+
+  /**
+   * Finds the stem(s) of the provided word.  If the word itself is found in the dictionary it is the first entry
+   * of the result, followed by every stem produced by stripping affixes.
+   *
+   * @param word Buffer holding the word to find the stems for
+   * @param length Number of chars of the buffer that make up the word
+   * @return List of stems for the word
+   */
+  public List<Stem> stem(char word[], int length) {
+    List<Stem> result = new ArrayList<Stem>();
+
+    boolean knownWord = dictionary.lookupWord(word, 0, length) != null;
+    if (knownWord) {
+      result.add(new Stem(word, length));
+    }
+
+    List<Stem> affixStems = stem(word, length, null, 0);
+    result.addAll(affixStems);
+    return result;
+  }
+  
+  /**
+   * Find the unique stem(s) of the provided word.  Stems are returned in the same order as {@link #stem(char[], int)}
+   * produces them, but a stem whose text has already been seen is dropped.
+   *
+   * @param word Buffer holding the word to find the stems for
+   * @param length Number of chars of the buffer that make up the word
+   * @return List of stems for the word, without duplicates
+   */
+  public List<Stem> uniqueStems(char word[], int length) {
+    List<Stem> stems = new ArrayList<Stem>();
+    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
+    if (dictionary.lookupWord(word, 0, length) != null) {
+      stems.add(new Stem(word, length));
+      // Only the first length chars belong to the word.  Adding the raw array would key the set on the whole
+      // buffer and keep a reference to an array the caller may overwrite.
+      terms.add(new String(word, 0, length));
+    }
+    for (Stem s : stem(word, length, null, 0)) {
+      if (!terms.contains(s.getStem(), 0, s.getStemLength())) {
+        stems.add(s);
+        terms.add(s.getStemString());
+      }
+    }
+    return stems;
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  /**
+   * Generates a list of stems for the provided word by stripping every matching suffix and then every matching prefix
+   *
+   * @param word Word to generate the stems for
+   * @param length Number of chars of word to consider
+   * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
+   * @param recursionDepth Level of recursion this stemming step is at
+   * @return List of stems, or an empty list if no stems are found
+   */
+  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
+    List<Stem> found = new ArrayList<Stem>();
+
+    // Suffix pass: try every tail of the word, longest first
+    for (int start = 0; start < length; start++) {
+      List<HunspellAffix> candidates = dictionary.lookupSuffix(word, start, length - start);
+      if (candidates == null) {
+        continue;
+      }
+
+      for (HunspellAffix suffix : candidates) {
+        if (!hasCrossCheckedFlag(suffix.getFlag(), flags)) {
+          continue;
+        }
+
+        // Drop the appended text from the end of the word and put the stripped text back
+        int keptLength = length - suffix.getAppend().length();
+        StringBuilder rebuilt = new StringBuilder();
+        rebuilt.append(word, 0, keptLength);
+        rebuilt.append(suffix.getStrip());
+        char[] candidate = rebuilt.toString().toCharArray();
+
+        for (Stem stem : applyAffix(candidate, candidate.length, suffix, recursionDepth)) {
+          stem.addSuffix(suffix);
+          found.add(stem);
+        }
+      }
+    }
+
+    // Prefix pass: try every head of the word, longest first
+    for (int prefixLength = length - 1; prefixLength >= 0; prefixLength--) {
+      List<HunspellAffix> candidates = dictionary.lookupPrefix(word, 0, prefixLength);
+      if (candidates == null) {
+        continue;
+      }
+
+      for (HunspellAffix prefix : candidates) {
+        if (!hasCrossCheckedFlag(prefix.getFlag(), flags)) {
+          continue;
+        }
+
+        // Put the stripped text back in front of what remains after the appended text is dropped
+        int remainderStart = prefix.getAppend().length();
+        StringBuilder rebuilt = new StringBuilder();
+        rebuilt.append(prefix.getStrip());
+        rebuilt.append(word, remainderStart, length - remainderStart);
+        char[] candidate = rebuilt.toString().toCharArray();
+
+        for (Stem stem : applyAffix(candidate, candidate.length, prefix, recursionDepth)) {
+          stem.addPrefix(prefix);
+          found.add(stem);
+        }
+      }
+    }
+
+    return found;
+  }
+
+  /**
+   * Applies the affix rule to the given word, producing a list of stems if any are found
+   *
+   * @param strippedWord Word the affix has been removed and the strip added
+   * @param length Number of chars of strippedWord to consider
+   * @param affix HunspellAffix representing the affix rule itself
+   * @param recursionDepth Level of recursion this stemming step is at
+   * @return List of stems for the word, or an empty list if none are found
+   */
+  public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
+    // The affix only applies when the stripped word satisfies the affix's condition
+    segment.setLength(0);
+    segment.append(strippedWord, 0, length);
+    if (!affix.checkCondition(segment)) {
+      return Collections.emptyList();
+    }
+
+    List<Stem> stems = new ArrayList<Stem>();
+
+    // The stripped word is a stem when a dictionary entry for it carries the affix's flag
+    List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
+    if (words != null) {
+      for (HunspellWord hunspellWord : words) {
+        if (hunspellWord.hasFlag(affix.getFlag())) {
+          stems.add(new Stem(strippedWord, length));
+        }
+      }
+    }
+
+    // Keep stripping affixes from the stripped word until the recursion cap is reached
+    if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
+      stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), recursionDepth + 1));
+    }
+
+    return stems;
+  }
+
+  /**
+   * Tells whether the given flag passes the cross check against the given array of flags
+   *
+   * @param flag Flag to look for
+   * @param flags Array of flags to search, which is binary searched.  Can be {@code null}
+   * @return {@code true} if the array is {@code null} or the flag is found in it, {@code false} otherwise
+   */
+  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
+    if (flags == null) {
+      return true;
+    }
+    return Arrays.binarySearch(flags, flag) >= 0;
+  }
+
+  /**
+   * A stem found for a word, together with the prefixes and suffixes that were removed from the word to reach it.
+   */
+  public static class Stem {
+
+    private final char[] stem;
+    private final int stemLength;
+    private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
+    private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
+
+    /**
+     * Creates a new Stem wrapping the given word stem.  The array is stored as given, not copied.
+     *
+     * @param stem Buffer holding the stem of a word
+     * @param stemLength Number of chars of the buffer that make up the stem
+     */
+    public Stem(char stem[], int stemLength) {
+      this.stem = stem;
+      this.stemLength = stemLength;
+    }
+
+    /**
+     * Records a prefix used to generate this stem.  Prefixes are assumed to be added depth first, so the prefix
+     * is inserted at the front of the list
+     *
+     * @param prefix Prefix to add to the list of prefixes for this stem
+     */
+    public void addPrefix(HunspellAffix prefix) {
+      prefixes.add(0, prefix);
+    }
+
+    /**
+     * Records a suffix used to generate this stem.  Suffixes are assumed to be added depth first, so the suffix
+     * is appended to the end of the list
+     *
+     * @param suffix Suffix to add to the list of suffixes for this stem
+     */
+    public void addSuffix(HunspellAffix suffix) {
+      suffixes.add(suffix);
+    }
+
+    /**
+     * Returns the list of prefixes used to generate the stem
+     *
+     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
+     */
+    public List<HunspellAffix> getPrefixes() {
+      return prefixes;
+    }
+
+    /**
+     * Returns the list of suffixes used to generate the stem
+     *
+     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
+     */
+    public List<HunspellAffix> getSuffixes() {
+      return suffixes;
+    }
+
+    /**
+     * Returns the buffer holding the word stem.  Only the first {@link #getStemLength()} chars belong to the stem.
+     *
+     * @return Buffer holding the word stem
+     */
+    public char[] getStem() {
+      return stem;
+    }
+
+    /**
+     * Returns the number of chars of the stem buffer that make up the stem
+     *
+     * @return the stemLength
+     */
+    public int getStemLength() {
+      return stemLength;
+    }
+
+    /**
+     * Returns the stem as a String built from the first {@link #getStemLength()} chars of the stem buffer
+     *
+     * @return Word stem as a String
+     */
+    public String getStemString() {
+      return String.valueOf(stem, 0, stemLength);
+    }
+
+  }
+
+
+  // ================================================= Entry Point ===================================================
+
+  /**
+   * HunspellStemmer entry point.  Accepts two arguments: location of affix file and location of dic file.  After
+   * loading the dictionary it reads words from standard input, one per line, and prints their stems until the
+   * input ends or the line "exit" is read.
+   *
+   * @param args Program arguments.  Should contain location of affix file and location of dic file
+   * @throws IOException Can be thrown while reading from the files
+   * @throws ParseException Can be thrown while parsing the files
+   */
+  public static void main(String[] args) throws IOException, ParseException {
+    if (args.length != 2) {
+      System.out.println("usage: HunspellStemmer <affix location> <dic location>");
+      System.exit(1);
+    }
+
+    // Close both files even when opening the second one or building the dictionary fails
+    HunspellDictionary dictionary;
+    InputStream affixInputStream = new FileInputStream(args[0]);
+    try {
+      InputStream dicInputStream = new FileInputStream(args[1]);
+      try {
+        dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40);
+      } finally {
+        dicInputStream.close();
+      }
+    } finally {
+      affixInputStream.close();
+    }
+
+    HunspellStemmer stemmer = new HunspellStemmer(dictionary);
+
+    Scanner scanner = new Scanner(System.in);
+
+    System.out.print("> ");
+    while (scanner.hasNextLine()) {
+      String word = scanner.nextLine();
+
+      if ("exit".equals(word)) {
+        break;
+      }
+
+      printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
+
+      System.out.print("> ");
+    }
+  }
+
+  /**
+   * Prints the results of the stemming of a word.  Each stem is printed on its own line together with the
+   * prefixes and suffixes that were used to reach it.
+   *
+   * @param originalWord Word that has been stemmed
+   * @param stems Stems of the word
+   */
+  private static void printStemResults(String originalWord, List<Stem> stems) {
+    StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
+
+    for (Stem stem : stems) {
+      // Use the String form so that only the first stemLength chars of the stem buffer are printed
+      String stemText = stem.getStemString();
+      builder.append("- ").append(stemText).append(": ");
+
+      for (HunspellAffix prefix : stem.getPrefixes()) {
+        builder.append(prefix.getAppend()).append("+");
+
+        if (hasText(prefix.getStrip())) {
+          builder.append(prefix.getStrip()).append("-");
+        }
+      }
+
+      builder.append(stemText);
+
+      for (HunspellAffix suffix : stem.getSuffixes()) {
+        if (hasText(suffix.getStrip())) {
+          builder.append("-").append(suffix.getStrip());
+        }
+
+        builder.append("+").append(suffix.getAppend());
+      }
+      builder.append("\n");
+    }
+
+    System.out.println(builder);
+  }
+
+  /**
+   * Tells whether the given String contains at least one character
+   *
+   * @param str String to check, may be {@code null}
+   * @return {@code true} if the String is not {@code null} and not empty, {@code false} otherwise
+   */
+  private static boolean hasText(String str) {
+    if (str == null) {
+      return false;
+    }
+    return str.length() != 0;
+  }
+}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.hunspell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+/**
+ * A word entry holding the flags associated with the word.
+ */
+public class HunspellWord {
+
+  private final char flags[]; // kept sorted so that hasFlag can binary search it
+
+  /**
+   * Creates a new HunspellWord with no associated flags
+   */
+  public HunspellWord() {
+    flags = null;
+  }
+
+  /**
+   * Constructs a new HunspellWord with the given flags.  A sorted copy of the flags is stored, so the flags may
+   * be passed in any order and the given array is left untouched.
+   *
+   * @param flags Flags to associate with the word.  Can be {@code null}, meaning no flags
+   */
+  public HunspellWord(char[] flags) {
+    if (flags == null) {
+      this.flags = null;
+    } else {
+      // hasFlag relies on binary search, which requires a sorted array
+      char[] sorted = flags.clone();
+      Arrays.sort(sorted);
+      this.flags = sorted;
+    }
+  }
+
+  /**
+   * Checks whether the word has the given flag associated with it
+   *
+   * @param flag Flag to check whether it is associated with the word
+   * @return {@code true} if the flag is associated, {@code false} otherwise
+   */
+  public boolean hasFlag(char flag) {
+    return flags != null && Arrays.binarySearch(flags, flag) >= 0;
+  }
+
+  /**
+   * Returns the flags associated with the word
+   *
+   * @return Flags associated with the word in sorted order, or {@code null} if the word has no flags
+   */
+  public char[] getFlags() {
+    return flags;
+  }
+}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package.html
@ -0,0 +1,26 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+<html>
+<body>
+Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
+Hunspell stemming algorithm.</a>
+<p>
+Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
+OpenOffice's wiki</a>.
+</p>
+</body>
+</html>
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+
+import static junit.framework.Assert.assertEquals;
+
+/**
+ * Checks that a HunspellDictionary loaded from test.aff and test.dic exposes the expected suffixes, prefixes and words.
+ */
+public class HunspellDictionaryTest {
+
+  @Test
+  public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
+    InputStream affixStream = getClass().getResourceAsStream("test.aff");
+    InputStream dictStream = getClass().getResourceAsStream("test.dic");
+
+    // Close the streams even when loading the dictionary or one of the assertions fails
+    try {
+      HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
+      assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+      assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+      assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
+    } finally {
+      try {
+        if (affixStream != null) {
+          affixStream.close();
+        }
+      } finally {
+        if (dictStream != null) {
+          dictStream.close();
+        }
+      }
+    }
+  }
+}
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
@ -0,0 +1,76 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Version;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.List;
+
+import static junit.framework.Assert.assertEquals;
+
+/**
+ * Checks the stems produced by a HunspellStemmer built from test.aff and test.dic.
+ */
+public class HunspellStemmerTest {
+
+  private static HunspellStemmer stemmer;
+
+  @BeforeClass
+  public static void beforeClass() throws IOException, ParseException {
+    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
+    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
+
+    // Close the streams even when loading the dictionary fails
+    try {
+      HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
+      stemmer = new HunspellStemmer(dictionary);
+    } finally {
+      try {
+        if (affixStream != null) {
+          affixStream.close();
+        }
+      } finally {
+        if (dictStream != null) {
+          dictStream.close();
+        }
+      }
+    }
+  }
+
+  // "lucene" is a dictionary word itself and is also "lucen" plus the suffix "e"; "mahoute" is only "mahout" plus "e"
+  @Test
+  public void testStem_simpleSuffix() {
+    List<HunspellStemmer.Stem> stems = stemmer.stem("lucene");
+
+    assertEquals(2, stems.size());
+    assertEquals("lucene", stems.get(0).getStemString());
+    assertEquals("lucen", stems.get(1).getStemString());
+
+    stems = stemmer.stem("mahoute");
+    assertEquals(1, stems.size());
+    assertEquals("mahout", stems.get(0).getStemString());
+  }
+
+  // "solr" is the dictionary word "olr" with the prefix "s"
+  @Test
+  public void testStem_simplePrefix() {
+    List<HunspellStemmer.Stem> stems = stemmer.stem("solr");
+
+    assertEquals(1, stems.size());
+    assertEquals("olr", stems.get(0).getStemString());
+  }
+
+  // "abcd" reaches the dictionary word "ab" only after stripping the suffix "d" and then the suffix "c"
+  @Test
+  public void testStem_recursiveSuffix() {
+    List<HunspellStemmer.Stem> stems = stemmer.stem("abcd");
+
+    assertEquals(1, stems.size());
+    assertEquals("ab", stems.get(0).getStemString());
+  }
+
+}
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
@ -0,0 +1,13 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 2
+SFX A   0     e         n
+SFX A   0     e         t
+
+SFX C Y 2
+SFX C   0     d/C       c
+SFX C   0     c         b
+
+PFX B Y 1
+PFX B   0     s         o
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
@ -0,0 +1,6 @@
+5
+lucen/A
+lucene
+mahout/A
+olr/B
+ab/C