From 6a4e1e3a9262913c0284123503d361e599009534 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 04:40:07 +0000
Subject: [PATCH 01/17] create branch

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571136 13f79535-47bb-0310-9956-ffa450edef68

From 2e0fc562bc239ea897023796160a8870eddd2a48 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 04:41:03 +0000
Subject: [PATCH 02/17] LUCENE-5468: commit current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571137 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 157 +++++
 .../lucene/analysis/hunspell2/Dictionary.java | 606 ++++++++++++++++++
 .../hunspell2/Hunspell2StemFilter.java        | 139 ++++
 .../hunspell2/Hunspell2StemFilterFactory.java |  80 +++
 .../analysis/hunspell2/ISO8859_14Decoder.java |  60 ++
 .../lucene/analysis/hunspell2/Stemmer.java    | 288 +++++++++
 .../lucene/analysis/hunspell2/package.html    |  26 +
 ...he.lucene.analysis.util.TokenFilterFactory |   1 +
 .../hunspell2/TestAllDictionaries.java        | 205 ++++++
 .../analysis/hunspell2/TestDictionary.java    | 109 ++++
 .../hunspell2/TestHunspell2StemFilter.java    |  87 +++
 .../TestHunspell2StemFilterFactory.java       |  50 ++
 .../analysis/hunspell2/TestStemmer.java       | 105 +++
 .../lucene/analysis/hunspell2/broken.aff      |  24 +
 .../lucene/analysis/hunspell2/compressed.aff  |  29 +
 .../lucene/analysis/hunspell2/compressed.dic  |   9 +
 .../lucene/analysis/hunspell2/simple.aff      |  20 +
 .../lucene/analysis/hunspell2/simple.dic      |  10 +
 18 files changed, 2005 insertions(+)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
new file mode 100644
index 00000000000..41c3553fb77
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@@ -0,0 +1,157 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.regex.Pattern;
+
+/**
+ * Mutable holder for the data of one hunspell affix rule (flag, strip, append, condition)
+ */
+final class Affix {
+
+  private char flag;
+  private boolean crossProduct;
+
+  private String strip;
+  private String append; // the affix itself, what is appended
+  private char appendFlags[]; // continuation class flags
+
+  // raw condition text and its compiled form; setCondition assigns both together
+  private String condition;
+  private Pattern conditionPattern;
+
+  /**
+   * Returns the affix flag
+   *
+   * @return Flag stored for this affix
+   */
+  public char getFlag() {
+    return flag;
+  }
+
+  /**
+   * Sets the affix flag
+   *
+   * @param flag Flag to store for this affix
+   */
+  public void setFlag(char flag) {
+    this.flag = flag;
+  }
+
+  /**
+   * Tells whether the affix is defined as cross product
+   *
+   * @return {@code true} if the affix is cross product, {@code false} otherwise
+   */
+  public boolean isCrossProduct() {
+    return crossProduct;
+  }
+
+  /**
+   * Marks the affix as cross product or not
+   *
+   * @param crossProduct Whether the affix is defined as cross product
+   */
+  public void setCrossProduct(boolean crossProduct) {
+    this.crossProduct = crossProduct;
+  }
+
+  /**
+   * Returns the stripping characters of the affix
+   *
+   * @return Stripping characters defined for the affix
+   */
+  public String getStrip() {
+    return strip;
+  }
+
+  /**
+   * Sets the stripping characters of the affix
+   *
+   * @param strip Stripping characters defined for the affix
+   */
+  public void setStrip(String strip) {
+    this.strip = strip;
+  }
+
+  /**
+   * Returns the append of the affix, i.e. the affix text itself
+   *
+   * @return Defined append
+   */
+  public String getAppend() {
+    return append;
+  }
+
+  /**
+   * Sets the append of the affix, i.e. the affix text itself
+   *
+   * @param append Defined append for the affix
+   */
+  public void setAppend(String append) {
+    this.append = append;
+  }
+
+  /**
+   * Returns the continuation flags attached to the append
+   *
+   * @return Flags defined for the affix append; the stored array itself, not a copy
+   */
+  public char[] getAppendFlags() {
+    return appendFlags;
+  }
+
+  /**
+   * Sets the continuation flags attached to the append
+   *
+   * @param appendFlags Flags defined for the affix append; the array is stored as is
+   */
+  public void setAppendFlags(char[] appendFlags) {
+    this.appendFlags = appendFlags;
+  }
+
+  /**
+   * Returns the textual condition that must be met before the affix can be applied
+   *
+   * @return Condition text as passed to {@link #setCondition(String, String)}
+   */
+  public String getCondition() {
+    return condition;
+  }
+
+  /**
+   * Stores the condition of the affix and compiles the regular expression used by {@link #checkCondition}
+   *
+   * @param condition Condition to be met before affix application
+   * @param pattern Condition as a regular expression pattern
+   */
+  public void setCondition(String condition, String pattern) {
+    this.condition = condition;
+    this.conditionPattern = Pattern.compile(pattern);
+  }
+
+  /**
+   * Tests the given text against the compiled condition; the whole text has to match
+   *
+   * @param text Text to check if it matches the affix's conditional pattern
+   * @return {@code true} if the text meets the condition, {@code false} otherwise
+   */
+  public boolean checkCondition(CharSequence text) {
+    return conditionPattern.matcher(text).matches();
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
new file mode 100644
index 00000000000..a7b9a58f080
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -0,0 +1,606 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * In-memory structure for the dictionary (.dic) and affix (.aff)
+ * data of a hunspell dictionary.
+ */
+public class Dictionary {
+
+  static final char[] NOFLAGS = new char[0]; // shared empty flag set for entries written without flags
+  // directive keywords matched against the start of each affix-file line
+  private static final String ALIAS_KEY = "AF";
+  private static final String PREFIX_KEY = "PFX";
+  private static final String SUFFIX_KEY = "SFX";
+  private static final String FLAG_KEY = "FLAG";
+  // flag type names accepted on a FLAG line (see getFlagParsingStrategy)
+  private static final String NUM_FLAG_TYPE = "num";
+  private static final String UTF8_FLAG_TYPE = "UTF-8";
+  private static final String LONG_FLAG_TYPE = "long";
+  // String.format templates that turn an affix condition into a regex covering the whole word
+  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
+  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+  // parsed affix rules, keyed by the text each rule appends (filled by parseAffix)
+  public CharArrayMap<List<Affix>> prefixes;
+  public CharArrayMap<List<Affix>> suffixes;
+  
+  // the entries in the .dic file, mapping to their set of flags.
+  // the fst output is the ordinal for flagLookup
+  public FST<Long> words;
+  // the list of unique flagsets (wordforms). theoretically huge, but practically
+  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
+  public BytesRefHash flagLookup = new BytesRefHash();
+
+  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+  // AF alias table: sized by the first AF line, then filled in file order; aliasCount is the number filled so far
+  private String[] aliases;
+  private int aliasCount = 0;
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files. The affix stream is read twice: first as raw bytes to find the encoding it declares,
+   * then, after rewinding, as text in that encoding. You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
+    BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
+    buffered.mark(8192); // lets the encoding scan be undone, as long as it stays within 8192 bytes
+    String encoding = getDictionaryEncoding(buffered); // scan through the marked wrapper, never the raw stream
+    buffered.reset(); // back to the first byte, so readAffixFile sees the complete file
+    CharsetDecoder decoder = getJavaEncoding(encoding);
+    readAffixFile(buffered, decoder);
+    TreeMap<BytesRef,Integer> tempWords = new TreeMap<BytesRef,Integer>();
+    flagLookup.add(new BytesRef()); // no flags -> ord 0
+    readDictionaryFile(dictionary, decoder, tempWords);
+    PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
+    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4
+    IntsRef scratchInts = new IntsRef();
+    for (Map.Entry<BytesRef,Integer> e : tempWords.entrySet()) { // TreeMap iteration feeds the builder in key order
+      UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts);
+      b.add(scratchInts, e.getValue().longValue());
+    }
+    words = b.finish();
+  }
+
+  /**
+   * Returns the flags of the dictionary entry spelled by the given slice of a char array
+   *
+   * @param word Char array holding the word
+   * @param offset Index in the array where the word starts
+   * @param length Number of chars in the word
+   * @param scratch Reusable BytesRef that receives the encoded flags during the lookup
+   * @return Decoded flags of the entry, or {@code null} if the word is not in the dictionary
+   */
+  char[] lookupWord(char word[], int offset, int length, BytesRef scratch) {
+    final Integer flagOrd;
+    try {
+      flagOrd = lookupOrd(word, offset, length);
+    } catch (IOException ignored) { // marked "bogus" by the original author; handled like a word that is absent
+      return null;
+    }
+    return flagOrd == null ? null : decodeFlags(flagLookup.get(flagOrd, scratch));
+  }
+  
+  /** Walks the words FST one code point at a time; returns the summed output (the flagLookup ordinal) or null. */
+  public Integer lookupOrd(char word[], int offset, int length) throws IOException {
+    final FST.BytesReader in = words.getBytesReader();
+    final FST.Arc<Long> scratchArc = words.getFirstArc(new FST.Arc<Long>());
+    final Long noOutput = words.outputs.getNoOutput();
+    Long sum = noOutput; // running total of the outputs seen along the path
+
+    final int limit = offset + length;
+    int pos = offset;
+    while (pos < limit) {
+      final int codePoint = Character.codePointAt(word, pos, limit);
+      pos += Character.charCount(codePoint);
+      if (words.findTargetArc(codePoint, scratchArc, scratchArc, in) == null) {
+        return null; // no arc for this code point
+      }
+      if (scratchArc.output != noOutput) {
+        sum = words.outputs.add(sum, scratchArc.output);
+      }
+    }
+    if (words.findTargetArc(FST.END_LABEL, scratchArc, scratchArc, in) == null) {
+      return null; // no END_LABEL arc: the input is not a complete entry
+    }
+    return (scratchArc.output != noOutput ? words.outputs.add(sum, scratchArc.output) : sum).intValue();
+  }
+
+  /**
+   * Finds the prefix rules whose append text equals the given slice of a char array
+   * @param word Char array holding the candidate prefix
+   * @param offset Index in the array where the candidate starts
+   * @param length Number of chars in the candidate
+   * @return Prefix rules registered under that append text, or {@code null} if none are found
+   */
+  public List<Affix> lookupPrefix(char word[], int offset, int length) {
+    final List<Affix> rules = prefixes.get(word, offset, length);
+    return rules;
+  }
+
+  /**
+   * Finds the suffix rules whose append text equals the given slice of a char array
+   * @param word Char array holding the candidate suffix
+   * @param offset Index in the array where the candidate starts
+   * @param length Number of chars in the candidate
+   * @return Suffix rules registered under that append text, or {@code null} if none are found
+   */
+  List<Affix> lookupSuffix(char word[], int offset, int length) {
+    final List<Affix> rules = suffixes.get(word, offset, length);
+    return rules;
+  }
+
+  /**
+   * Reads the affix file line by line and dispatches on the directive each line starts with: AF lines feed
+   * the alias table, PFX/SFX headers (together with their rule lines) fill the prefix and suffix maps, and a
+   * FLAG line selects the flag parsing strategy. Lines starting with anything else are ignored.
+   *
+   * @param affixStream InputStream to read the content of the affix file from
+   * @param decoder CharsetDecoder to decode the content of the file
+   * @throws IOException Can be thrown while reading from the InputStream
+   * @throws ParseException Can be thrown while parsing an affix rule
+   */
+  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
+    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    final LineNumberReader lines = new LineNumberReader(new InputStreamReader(affixStream, decoder));
+    for (String current = lines.readLine(); current != null; current = lines.readLine()) {
+      if (current.startsWith(ALIAS_KEY)) {
+        parseAlias(current);
+      } else if (current.startsWith(PREFIX_KEY)) {
+        parseAffix(prefixes, current, lines, PREFIX_CONDITION_REGEX_PATTERN);
+      } else if (current.startsWith(SUFFIX_KEY)) {
+        parseAffix(suffixes, current, lines, SUFFIX_CONDITION_REGEX_PATTERN);
+      } else if (current.startsWith(FLAG_KEY)) {
+        // assumes the FLAG line precedes every PFX/SFX block; the strategy is kept for the dic file as well
+        flagParsingStrategy = getFlagParsingStrategy(current);
+      }
+    }
+  }
+
+  /**
+   * Parses one affix class: the PFX/SFX header line that was already read plus the rule lines that follow it.
+   * Each rule becomes an {@link Affix} that is stored in the given map under its append text.
+   *
+   * @param affixes Map where the result of the parsing will be put
+   * @param header Header line of the affix rule
+   * @param reader LineNumberReader to read the content of the rule from
+   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
+   *                         pattern
+   * @throws IOException Can be thrown while reading the rule
+   * @throws ParseException If the header or a rule has too few elements, or the file ends inside the affix class
+   */
+  private void parseAffix(CharArrayMap<List<Affix>> affixes,
+                          String header,
+                          LineNumberReader reader,
+                          String conditionPattern) throws IOException, ParseException {
+    String args[] = header.split("\\s+");
+    if (args.length < 4) {
+      throw new ParseException("The affix file contains a header with less than four elements", reader.getLineNumber());
+    }
+    boolean crossProduct = args[2].equals("Y");
+    int numLines = Integer.parseInt(args[3]);
+    for (int i = 0; i < numLines; i++) {
+      String line = reader.readLine();
+      if (line == null) {
+        throw new ParseException("The affix file ends before all announced rules were read", reader.getLineNumber());
+      }
+      String ruleArgs[] = line.split("\\s+");
+      if (ruleArgs.length < 5) {
+          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
+      }
+
+      Affix affix = new Affix();
+      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
+      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
+
+      String affixArg = ruleArgs[3];
+      int flagSep = affixArg.lastIndexOf('/');
+      if (flagSep != -1) {
+        String flagPart = affixArg.substring(flagSep + 1);
+        if (aliasCount > 0) {
+          flagPart = getAliasValue(Integer.parseInt(flagPart));
+        }
+        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
+        Arrays.sort(appendFlags); // kept sorted so that hasFlag can binary-search them
+        affix.setAppendFlags(appendFlags);
+        affix.setAppend(affixArg.substring(0, flagSep));
+      } else {
+        affix.setAppend(affixArg);
+      }
+
+      String condition = ruleArgs[4];
+      // at least the gascon affix file has this issue
+      if (condition.startsWith("[") && !condition.endsWith("]")) {
+        condition = condition + "]";
+      }
+      // "dash hasn't got special meaning" (we must escape it)
+      if (condition.indexOf('-') >= 0) {
+        condition = condition.replace("-", "\\-");
+      }
+      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
+      affix.setCrossProduct(crossProduct);
+
+      List<Affix> list = affixes.get(affix.getAppend());
+      if (list == null) {
+        list = new ArrayList<Affix>();
+        affixes.put(affix.getAppend(), list);
+      }
+      list.add(affix);
+    }
+  }
+
+  /**
+   * Scans the affix file for its {@code SET <encoding>} line and returns the encoding named there. The file is
+   * consumed byte by byte, without any charset decoding. Blank lines and lines starting with {@code #} are
+   * skipped, and so is every other line that is not a {@code SET} line: the scan only stops at the first
+   * {@code SET} line or at the end of the stream.
+   *
+   * @param affix InputStream for reading the affix file
+   * @return Encoding specified in the affix file, with surrounding whitespace removed
+   * @throws IOException Can be thrown while reading from the InputStream
+   * @throws ParseException Thrown if the end of the stream is reached without finding a {@code SET <encoding>} line
+   */
+  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
+    final StringBuilder lineBuffer = new StringBuilder();
+    while (true) {
+      // collect the next line; bytes are widened to chars one to one, carriage returns are dropped
+      lineBuffer.setLength(0);
+      int b = affix.read();
+      while (b >= 0 && b != '\n') {
+        if (b != '\r') {
+          lineBuffer.append((char) b);
+        }
+        b = affix.read();
+      }
+
+      final String line = lineBuffer.toString();
+      if (line.trim().isEmpty() || line.charAt(0) == '#') {
+        // blank or comment line: keep scanning unless the stream is exhausted
+        if (b < 0) {
+          throw new ParseException("Unexpected end of affix file.", 0);
+        }
+      } else if (line.length() > 4 && line.startsWith("SET ")) {
+        // cleanup the encoding string, too (whitespace)
+        return line.substring(4).trim();
+      }
+      // any other line is skipped as well; the end of the stream then shows up as an empty line next round
+    }
+  }
+
+  static final Map<String,String> CHARSET_ALIASES; // encoding name used in affix files -> name given to Charset.forName
+  static {
+    Map<String,String> aliases = new HashMap<>();
+    aliases.put("TIS620-2533", "TIS-620");
+    aliases.put("microsoft-cp1251", "windows-1251");
+    CHARSET_ALIASES = Collections.unmodifiableMap(aliases);
+  }
+  
+  /**
+   * Maps an encoding name taken from an affix file to a decoder. ISO8859-14 gets an {@link ISO8859_14Decoder};
+   * names listed in {@link #CHARSET_ALIASES} are translated first; every other name is handed to
+   * {@link Charset#forName(String)} unchanged. Malformed input is replaced rather than reported.
+   *
+   * @param encoding Encoding to retrieve the CharsetDecoder for
+   * @return CharSetDecoder for the given encoding
+   */
+  private CharsetDecoder getJavaEncoding(String encoding) {
+    if ("ISO8859-14".equals(encoding)) {
+      return new ISO8859_14Decoder();
+    }
+    final String alias = CHARSET_ALIASES.get(encoding);
+    final String javaName = (alias == null) ? encoding : alias;
+    final CharsetDecoder decoder = Charset.forName(javaName).newDecoder();
+    decoder.onMalformedInput(CodingErrorAction.REPLACE);
+    return decoder;
+  }
+
+  /**
+   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file.
+   * Whitespace around the flag type is ignored, so a trailing blank does not turn a valid type into an unknown one.
+   *
+   * @param flagLine Line containing the flag information; has to start with the FLAG keyword
+   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
+   * @throws IllegalArgumentException If the line names no flag type or an unsupported one
+   */
+  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
+    String flagType = flagLine.substring(FLAG_KEY.length()).trim(); // empty for a bare "FLAG" line
+    if (NUM_FLAG_TYPE.equals(flagType)) {
+      return new NumFlagParsingStrategy();
+    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
+      return new SimpleFlagParsingStrategy();
+    } else if (LONG_FLAG_TYPE.equals(flagType)) {
+      return new DoubleASCIIFlagParsingStrategy();
+    }
+    throw new IllegalArgumentException("Unknown flag type: " + flagType);
+  }
+
+  /**
+   * Reads the dictionary file through the provided InputStream, building up the words map. The first line has
+   * to hold the number of entries; when an entry occurs more than once its flags are merged.
+   *
+   * @param dictionary InputStream to read the dictionary file through
+   * @param decoder CharsetDecoder used to decode the contents of the file
+   * @param words Sorted map receiving each entry (as a BytesRef) and the ordinal of its flag set in flagLookup
+   * @throws IOException Can be thrown while reading from the file, also when the file is completely empty
+   */
+  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap<BytesRef,Integer> words) throws IOException {
+    BytesRef flagsScratch = new BytesRef();
+    BytesRef flagsScratch2 = new BytesRef();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
+    // TODO: don't create millions of strings.
+    String line = reader.readLine(); // first line is number of entries
+    if (line == null) {
+      throw new EOFException("The dictionary file is empty, expected the number of entries on the first line");
+    }
+    // sometimes the number of entries has a comment/copyright after it
+    line = line.replaceFirst("\\s*\\#.*$", "");
+    Integer.parseInt(line.trim()); // only validates the header line; the count itself is not used
+    // TODO: the flags themselves can be double-chars (long) or also numeric
+    // either way the trick is to encode them as char... but they must be parsed differently
+    while ((line = reader.readLine()) != null) {
+      String entry;
+      char wordForm[];
+      int flagSep = line.lastIndexOf('/');
+      if (flagSep == -1) {
+        wordForm = NOFLAGS;
+        entry = line;
+      } else {
+        // note, there can be comments (morph description) after a flag.
+        // we should really look for any whitespace
+        int end = line.indexOf('\t', flagSep);
+        if (end == -1)
+          end = line.length();
+        String flagPart = line.substring(flagSep + 1, end);
+        if (aliasCount > 0) {
+          flagPart = getAliasValue(Integer.parseInt(flagPart));
+        } 
+        wordForm = flagParsingStrategy.parseFlags(flagPart);
+        Arrays.sort(wordForm);
+        entry = line.substring(0, flagSep);
+      }
+
+      BytesRef scratch = new BytesRef(entry);
+      Integer existingOrd = words.get(scratch);
+      final char mergedEntries[];
+      // ord 0 is the empty flag set, so in that case there is nothing to merge with
+      if (existingOrd == null || existingOrd == 0) {
+        mergedEntries = wordForm;
+      } else {
+        flagLookup.get(existingOrd, flagsScratch2);
+        mergedEntries = merge(decodeFlags(flagsScratch2), wordForm);
+      }
+
+      final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries);
+      int ord = flagLookup.add(flagsScratch, hashCode);
+      if (ord < 0) {
+        // already exists in our hash
+        ord = (-ord)-1;
+      }
+      words.put(scratch, ord);
+    }
+  }
+  
+  /** Rebuilds a flag array from bytes holding each flag as a high byte followed by a low byte. */
+  static char[] decodeFlags(BytesRef b) {
+    final char decoded[] = new char[b.length >>> 1];
+    final int limit = b.offset + b.length;
+    for (int src = b.offset, dst = 0; src < limit; src += 2, dst++) {
+      final int low = b.bytes[src + 1] & 0xff;
+      decoded[dst] = (char) ((b.bytes[src] << 8) | low); // the cast drops the sign extension of the high byte
+    }
+    return decoded;
+  }
+  
+  /** Writes each flag into b as high byte then low byte and returns a 31-based hash of the (signed) bytes written. */
+  static int encodeFlagsWithHash(BytesRef b, char flags[]) {
+    final int numBytes = flags.length << 1;
+    b.grow(numBytes);
+    b.length = numBytes;
+    int h = 0;
+    int out = b.offset;
+    for (char flag : flags) {
+      h = 31 * h + (b.bytes[out++] = (byte) (flag >> 8));
+      h = 31 * h + (b.bytes[out++] = (byte) flag);
+    }
+    return h;
+  }
+
+  /** Handles one AF line: the first one seen sizes the alias table, each later one stores the next alias. */
+  private void parseAlias(String line) {
+    final String value = line.split("\\s+")[1];
+    if (aliases != null) {
+      aliases[aliasCount++] = value;
+      return;
+    }
+    //first line should be the aliases count
+    aliases = new String[Integer.parseInt(value)];
+  }
+  
+  /** Resolves a 1-based alias number to the flag string read for it; fails for numbers that have no alias yet. */
+  private String getAliasValue(int id) {
+    if (id < 1 || id > aliasCount) { // slots past aliasCount were sized by the first AF line but never filled
+      throw new IllegalArgumentException("Bad flag alias number:" + id);
+    }
+    return aliases[id - 1];
+  }
+
+  /**
+   * Strategy for turning the textual flags found in the affix and dic files into chars
+   */
+  private static abstract class FlagParsingStrategy {
+    /**
+     * Parses the given String into multiple flags
+     *
+     * @param rawFlags String to parse into flags
+     * @return Parsed flags
+     */
+    abstract char[] parseFlags(String rawFlags);
+
+    /**
+     * Parses the given String into a single flag: the first of the flags {@link #parseFlags(String)} finds in it
+     *
+     * @param rawFlag String to parse into a flag
+     * @return Parsed flag
+     */
+    char parseFlag(String rawFlag) {
+      final char[] parsed = parseFlags(rawFlag);
+      return parsed[0];
+    }
+  }
+
+  /** Simple {@link FlagParsingStrategy}: every char of the input String is one flag.
+   *  Can be used with both the ASCII and UTF-8 flag types. */
+  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
+    @Override
+    public char[] parseFlags(String rawFlags) {
+      final char[] flags = new char[rawFlags.length()];
+      rawFlags.getChars(0, flags.length, flags, 0);
+      return flags;
+    }
+  }
+
+  /**
+   * {@link FlagParsingStrategy} for flags written in numerical form. The input is trimmed and split at commas,
+   * and each part becomes one flag. Every character that is not a digit is removed from a part before it is
+   * parsed as a decimal number; parts that are empty afterwards are skipped. The number is narrowed to a char
+   * without a range check.
+   */
+  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
+    @Override
+    public char[] parseFlags(String rawFlags) {
+      final String[] parts = rawFlags.trim().split(",");
+      final char[] parsed = new char[parts.length];
+      int count = 0;
+
+      for (String part : parts) {
+        // note, removing the trailing X/leading I for nepali... what is the rule here?! 
+        final String digits = part.replaceAll("[^0-9]", "");
+        // note, ignoring empty flags (this happens in danish, for example)
+        if (!digits.isEmpty()) {
+          parsed[count++] = (char) Integer.parseInt(digits);
+        }
+      }
+
+      // skipped parts leave unused slots at the end of the array; cut them off
+      return count < parsed.length ? Arrays.copyOf(parsed, count) : parsed;
+    }
+  }
+
+  /**
+   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
+   * must be combined into a single character. The first character goes into the high byte and the second into
+   * the low byte, so distinct pairs of ASCII characters always yield distinct flags.
+   *
+   * TODO (rmuir) test
+   */
+  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
+    @Override
+    public char[] parseFlags(String rawFlags) {
+      final int length = rawFlags.length();
+      if ((length & 1) != 0) {
+        throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
+      }
+
+      final char flags[] = new char[length >> 1];
+      for (int i = 0; i < flags.length; i++) {
+        final char first = rawFlags.charAt(2 * i);
+        final char second = rawFlags.charAt(2 * i + 1);
+        // shift instead of add: a plain sum of the codes would make "ab" and "ba" the same flag
+        flags[i] = (char) ((first << 8) | second);
+      }
+      return flags;
+    }
+  }
+  
+  static boolean hasFlag(char flags[], char flag) {
+    return 0 <= Arrays.binarySearch(flags, flag); // negative result means absent; flags must be sorted ascending
+  }
+  
+  /**
+   * Merges two flag arrays into one array without repeated values.
+   * <p>
+   * Both inputs are expected to be sorted in ascending order, as the flag arrays built while parsing are. The
+   * arrays are walked in parallel, always taking the smaller head, and a value is skipped when it equals the
+   * value emitted just before it.
+   *
+   * @param flags1 First sorted flag array
+   * @param flags2 Second sorted flag array
+   * @return Sorted union of both arrays; always a new array, sized to the number of distinct flags
+   */
+  static char[] merge(char[] flags1, char[] flags2) {
+    final char union[] = new char[flags1.length + flags2.length];
+    int size = 0;
+    int previous = -1; // an int, so that it differs from every possible char before the first emit
+
+    int pos1 = 0;
+    int pos2 = 0;
+    while (pos1 < flags1.length || pos2 < flags2.length) {
+      final char candidate;
+      if (pos2 == flags2.length) {
+        // second array used up: drain the first
+        candidate = flags1[pos1++];
+      } else if (pos1 == flags1.length) {
+        // first array used up: drain the second
+        candidate = flags2[pos2++];
+      } else if (flags1[pos1] <= flags2[pos2]) {
+        candidate = flags1[pos1++];
+      } else {
+        candidate = flags2[pos2++];
+      }
+
+      if (candidate != previous) {
+        union[size++] = candidate;
+        previous = candidate;
+      }
+    }
+
+    // duplicates leave unused slots at the end; only copy when there are any
+    return size == union.length ? union : Arrays.copyOf(union, size);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
new file mode 100644
index 00000000000..f9dfb770ab2
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
@@ -0,0 +1,139 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
+ * stems, this filter can emit multiple tokens for each consumed token
+ *
+ * <p>
+ * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+ * certain terms from being passed to the stemmer
+ * {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+ * in a previous {@link TokenStream}.
+ *
+ * Note: For including the original term as well as the stemmed version, see
+ * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
+ * </p>
+ *
+ * @lucene.experimental
+ */
+public final class Hunspell2StemFilter extends TokenFilter {
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  private final Stemmer stemmer;
+  
+  private List<Stem> buffer;
+  private State savedState;
+  
+  private final boolean dedup;
+
+  /** Creates a {@link Hunspell2StemFilter} which emits only unique stems and uses the default
+   *  maximum recursion level of 2.
+   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
+  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
+    this(input, dictionary, 2);
+  }
+
+  /**
+   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * Dictionary. Duplicate stems are removed from the output.
+   *
+   * @param input TokenStream whose tokens will be stemmed
+   * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
+   * @param recursionCap maximum level of recursion stemmer can go into
+   */
+  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
+    this(input, dictionary, true, recursionCap); // true: deduplicate stems
+  }
+
+  /** Creates a {@link Hunspell2StemFilter} which uses the default maximum recursion level of 2.
+   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
+  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
+    this(input, dictionary, dedup, 2);
+  }
+
+  /**
+   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * Dictionary
+   *
+   * @param input TokenStream whose tokens will be stemmed
+   * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
+   * @param dedup true if only unique terms should be output.
+   * @param recursionCap maximum level of recursion stemmer can go into; the shorter constructors pass <code>2</code>
+   */
+  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
+    super(input);
+    this.dedup = dedup;
+    this.stemmer = new Stemmer(dictionary, recursionCap); // every filter instance gets its own Stemmer
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    // First drain the stems left over from the last input token; each one is stacked on
+    // that token's position (position increment 0).
+    final boolean stemsPending = buffer != null && !buffer.isEmpty();
+    if (stemsPending) {
+      final Stem pending = buffer.remove(0);
+      restoreState(savedState);
+      posIncAtt.setPositionIncrement(0);
+      termAtt.copyBuffer(pending.getStem(), 0, pending.getStemLength());
+      termAtt.setLength(pending.getStemLength());
+      return true;
+    }
+
+    if (!input.incrementToken()) {
+      return false; // input is exhausted
+    }
+    if (keywordAtt.isKeyword()) {
+      return true; // keywords pass through without being stemmed
+    }
+
+    final char[] term = termAtt.buffer();
+    final int termLength = termAtt.length();
+    buffer = dedup ? stemmer.uniqueStems(term, termLength) : stemmer.stem(term, termLength);
+    if (!buffer.isEmpty()) {
+      // replace the term with the first stem; the others are emitted by later calls
+      final Stem first = buffer.remove(0);
+      termAtt.copyBuffer(first.getStem(), 0, first.getStemLength());
+      termAtt.setLength(first.getStemLength());
+      if (!buffer.isEmpty()) {
+        savedState = captureState();
+      }
+    } // else: we do not know this word, return it unchanged
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    buffer = null; // forget any stems still queued for emission
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
new file mode 100644
index 00000000000..6ce73698dfd
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
@@ -0,0 +1,80 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
+ * Example config for British English:
+ * <pre class="prettyprint">
+ * &lt;filter class=&quot;solr.Hunspell2StemFilterFactory&quot;
+ *         dictionary=&quot;en_GB.dic&quot;
+ *         affix=&quot;en_GB.aff&quot; /&gt;</pre>
+ * Both parameters dictionary and affix are mandatory.
+ * Dictionaries for many languages are available through the OpenOffice project.
+ * 
+ * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
+ * @lucene.experimental
+ */
+public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  private static final String PARAM_DICTIONARY    = "dictionary";
+  private static final String PARAM_AFFIX         = "affix";
+  private static final String PARAM_RECURSION_CAP = "recursionCap";
+
+  private final String dictionaryFile;
+  private final String affixFile;
+  private final int recursionCap;
+  private Dictionary dictionary; // loaded by inform(ResourceLoader)
+
+  /** Creates a new Hunspell2StemFilterFactory; the dictionary and affix parameters are required. */
+  public Hunspell2StemFilterFactory(Map<String,String> args) {
+    super(args);
+    dictionaryFile = require(args, PARAM_DICTIONARY);
+    affixFile = require(args, PARAM_AFFIX); // mandatory: otherwise inform() would open a null resource
+    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Loads the affix and dictionary resources; a malformed file is reported as an IOException. */
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    try (InputStream affix = loader.openResource(affixFile);
+        InputStream dictionary = loader.openResource(dictionaryFile)) {
+      this.dictionary = new Dictionary(affix, dictionary);
+    } catch (ParseException e) {
+      throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryFile + ",affix=" + affixFile + "]", e);
+    }
+  }
+
+  /** Wraps the given stream in a {@link Hunspell2StemFilter} using the loaded dictionary. */
+  @Override
+  public TokenStream create(TokenStream tokenStream) {
+    return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
new file mode 100644
index 00000000000..4de0d4bc051
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+
+import org.apache.lucene.util.IOUtils;
+
+// many hunspell dictionaries use this encoding, yet java does not have it?!?!
+final class ISO8859_14Decoder extends CharsetDecoder {
+  /** Characters for the byte values 0xA0 to 0xFF, indexed by (byte value - 0xA0). */
+  static final char TABLE[] = new char[] {
+    0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7, 
+    0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
+    0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56, 
+    0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
+    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 
+    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+    0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A, 
+    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
+    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 
+    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+    0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B, 
+    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
+  };
+
+  ISO8859_14Decoder() {
+    super(IOUtils.CHARSET_UTF_8, 1f, 1f); // one char per byte; NOTE(review): the UTF-8 charset looks like a stand-in - confirm
+  }
+
+  /**
+   * Maps bytes below 0xA0 to the same code point and all other bytes through {@link #TABLE}.
+   */
+  @Override
+  protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+    while (in.hasRemaining() && out.hasRemaining()) {
+      final int unsigned = in.get() & 0xff;
+      out.put(unsigned < 0xA0 ? (char) unsigned : TABLE[unsigned - 0xA0]);
+    }
+    return in.hasRemaining() ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
new file mode 100644
index 00000000000..7d36c81e4ae
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -0,0 +1,288 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.  It
+ * follows the original hunspell algorithm, including recursive suffix stripping.
+ */
+final class Stemmer {
+  private final int recursionCap;
+  private final Dictionary dictionary;
+  private BytesRef scratch = new BytesRef();
+  private final StringBuilder segment = new StringBuilder();
+
+  /**
+   * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
+   * default recursion cap of <code>2</code> (based on Hunspell documentation). 
+   *
+   * @param dictionary Dictionary that will be used to create the stems
+   */
+  public Stemmer(Dictionary dictionary) {
+    this(dictionary, 2); // 2 is the default recursion cap
+  }
+
+  /**
+   * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
+   *
+   * @param dictionary Dictionary that will be used to create the stems
+   * @param recursionCap maximum level of recursion stemmer can go into
+   */
+  public Stemmer(Dictionary dictionary, int recursionCap) {
+    this.dictionary = dictionary;
+    this.recursionCap = recursionCap; // applyAffix stops recursing once this depth is reached
+  } 
+  
+  /**
+   * Convenience overload of {@link #stem(char[], int)} that stems a whole String.
+   * @param word Word to find the stems for
+   * @return List of stems for the word
+   */
+  public List<Stem> stem(String word) {
+    final char chars[] = word.toCharArray();
+    return stem(chars, chars.length);
+  }
+
+  /**
+   * Find the stem(s) of the word stored in the first <code>length</code> chars of <code>word</code>
+   * @param word buffer holding the word to find the stems for
+   * @return List of stems; the word itself comes first when the dictionary contains it
+   */
+  public List<Stem> stem(char word[], int length) {
+    final List<Stem> result = new ArrayList<Stem>();
+    final char dictionaryFlags[] = dictionary.lookupWord(word, 0, length, scratch);
+    if (dictionaryFlags != null) {
+      result.add(new Stem(word, length)); // wraps the caller's array, no copy is made
+    }
+    result.addAll(stem(word, length, null, 0));
+    return result;
+  }
+  
+  /**
+   * Find the unique stem(s) of the provided word
+   * 
+   * @param word buffer holding the word to find the stems for
+   * @param length number of valid chars in word
+   * @return List of stems for the word with duplicates removed, in the order they were found
+   */
+  public List<Stem> uniqueStems(char word[], int length) {
+    List<Stem> stems = new ArrayList<Stem>();
+    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
+
+    for (Stem s : stem(word, length)) {
+      // Only the first stemLength chars of a stem array are valid. word in particular may be
+      // a reused buffer longer than length, so looking up or adding the whole array would
+      // compare trailing garbage and let a duplicate of the word through.
+      if (!terms.contains(s.stem, 0, s.stemLength)) {
+        stems.add(s);
+        terms.add(Arrays.copyOf(s.stem, s.stemLength)); // right-sized copy
+      }
+    }
+    return stems;
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  /**
+   * Generates a list of stems for the provided word by trying suffix rules first, then prefix rules
+   * @param word Word to generate the stems for
+   * @param length number of valid chars in word
+   * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
+   * @param recursionDepth Level of recursion this stemming step is at
+   * @return List of stems, or empty list if no stems are found
+   */
+  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
+    List<Stem> stems = new ArrayList<Stem>();
+    // Pass 1: suffixes. Each lookup is given offset i and length - i, i.e. the tail starting at i.
+    for (int i = 0; i < length; i++) {
+      List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      if (suffixes == null) {
+        continue;
+      }
+
+      for (Affix suffix : suffixes) {
+        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
+          int deAffixedLength = length - suffix.getAppend().length(); // word length without the appended text
+          // TODO: can we do this in-place?
+          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
+
+          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
+          for (Stem stem : stemList) {
+            stem.addSuffix(suffix);
+          }
+
+          stems.addAll(stemList);
+        }
+      }
+    }
+    // Pass 2: prefixes. Lookups start at offset 0 with lengths length - 1 down to 0.
+    for (int i = length - 1; i >= 0; i--) {
+      List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
+      if (prefixes == null) {
+        continue;
+      }
+
+      for (Affix prefix : prefixes) {
+        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
+          int deAffixedStart = prefix.getAppend().length(); // first char after the appended text
+          int deAffixedLength = length - deAffixedStart;
+
+          String strippedWord = new StringBuilder().append(prefix.getStrip())
+              .append(word, deAffixedStart, deAffixedLength)
+              .toString();
+
+          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
+          for (Stem stem : stemList) {
+            stem.addPrefix(prefix);
+          }
+
+          stems.addAll(stemList);
+        }
+      }
+    }
+
+    return stems;
+  }
+
+  /**
+   * Checks the affix rule's condition against the stripped word and, if it holds, collects its stems
+   * @param strippedWord Word with the affix removed and the strip added back
+   * @param length number of valid chars in strippedWord
+   * @param affix Affix rule that was stripped from the word
+   * @param recursionDepth Level of recursion this stemming step is at
+   * @return List of stems for the word, or an empty list if none are found
+   */
+  public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+    segment.setLength(0);
+    segment.append(strippedWord, 0, length);
+    if (!affix.checkCondition(segment)) {
+      return Collections.emptyList();
+    }
+
+    final List<Stem> found = new ArrayList<Stem>();
+    // the stripped word is itself a stem only if the dictionary lists it with this affix's flag
+    final char entryFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
+    final boolean acceptsAffix = entryFlags != null && Dictionary.hasFlag(entryFlags, affix.getFlag());
+    if (acceptsAffix) {
+      found.add(new Stem(strippedWord, length));
+    }
+    final boolean mayRecurse = affix.isCrossProduct() && recursionDepth < recursionCap;
+    if (mayRecurse) {
+      found.addAll(stem(strippedWord, length, affix.getAppendFlags(), recursionDepth + 1));
+    }
+    return found;
+  }
+
+  /**
+   * Checks if the given flag cross checks with the given array of flags
+   *
+   * @param flag Flag to cross check with the array of flags
+   * @param flags Array of flags to cross check against.  Can be {@code null}
+   * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
+   */
+  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
+    return flags == null || Dictionary.hasFlag(flags, flag); // null: nothing to cross-check against
+  }
+
+  /**
+   * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
+   * that were used to change the word into the stem.
+   */
+  public static class Stem {
+
+    private final List<Affix> prefixes = new ArrayList<Affix>();
+    private final List<Affix> suffixes = new ArrayList<Affix>();
+    private final char stem[]; // only the first stemLength chars are valid
+    private final int stemLength;
+
+    /**
+     * Creates a new Stem wrapping the given word stem; the array is stored as is, not copied
+     * @param stem array holding the stem of a word
+     * @param stemLength number of valid chars in stem
+     */
+    public Stem(char stem[], int stemLength) {
+      this.stem = stem;
+      this.stemLength = stemLength;
+    }
+
+    /**
+     * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
+     * depth first, the prefix is added to the front of the list
+     *
+     * @param prefix Prefix to add to the list of prefixes for this stem
+     */
+    public void addPrefix(Affix prefix) {
+      prefixes.add(0, prefix);
+    }
+
+    /**
+     * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
+     * depth first, the suffix is added to the end of the list
+     *
+     * @param suffix Suffix to add to the list of suffixes for this stem
+     */
+    public void addSuffix(Affix suffix) {
+      suffixes.add(suffix);
+    }
+
+    /**
+     * Returns the list of prefixes used to generate the stem (the internal list, not a copy)
+     *
+     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
+     */
+    public List<Affix> getPrefixes() {
+      return prefixes;
+    }
+
+    /**
+     * Returns the list of suffixes used to generate the stem (the internal list, not a copy)
+     * 
+     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
+     */
+    public List<Affix> getSuffixes() {
+      return suffixes;
+    }
+
+    /**
+     * Returns the array holding the text of the word's stem (the internal array, not a copy).
+     * @see #getStemLength()
+     */
+    public char[] getStem() {
+      return stem;
+    }
+
+    /** Returns the valid length of the text in {@link #getStem()} */
+    public int getStemLength() {
+      return stemLength;
+    }
+    
+    /** Only use this if you really need a string (e.g. for testing) */
+    public String getStemString() {
+      return new String(stem, 0, stemLength);
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
new file mode 100644
index 00000000000..196591969e8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
@@ -0,0 +1,26 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+<html>
+<body>
+Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
+Hunspell stemming algorithm.</a>
+<p>
+Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
+OpenOffice's wiki</a>
+</p>
+</body>
+</html>
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 04fc80cf59c..e4ca7c6802c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -51,6 +51,7 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
 org.apache.lucene.analysis.hi.HindiStemFilterFactory
 org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
 org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
+org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
 org.apache.lucene.analysis.id.IndonesianStemFilterFactory
 org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
 org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
new file mode 100644
index 00000000000..02ccedb9be7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -0,0 +1,205 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Can be retrieved via:
+ * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
+ * Note some of the files differ only in case. This may be a problem on your operating system!
+ */
+//@Ignore("enable manually")
+public class TestAllDictionaries extends LuceneTestCase {
+  
+  // set this to the location of where you downloaded all the files
+  static final File DICTIONARY_HOME = 
+      new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
+  
+  final String tests[] = {
+    /* zip file */               /* dictionary */       /* affix */
+    "af_ZA.zip",                 "af_ZA.dic",           "af_ZA.aff",
+    "ak_GH.zip",                 "ak_GH.dic",           "ak_GH.aff",
+    "bg_BG.zip",                 "bg_BG.dic",           "bg_BG.aff",
+    "ca_ANY.zip",                "catalan.dic",         "catalan.aff",
+    "ca_ES.zip",                 "ca_ES.dic",           "ca_ES.aff",
+    "cop_EG.zip",                "cop_EG.dic",          "cop_EG.aff",
+    "cs_CZ.zip",                 "cs_CZ.dic",           "cs_CZ.aff",
+    "cy_GB.zip",                 "cy_GB.dic",           "cy_GB.aff",
+    "da_DK.zip",                 "da_DK.dic",           "da_DK.aff",
+    "de_AT.zip",                 "de_AT.dic",           "de_AT.aff",
+    "de_CH.zip",                 "de_CH.dic",           "de_CH.aff",
+    "de_DE.zip",                 "de_DE.dic",           "de_DE.aff",
+    "de_DE_comb.zip",            "de_DE_comb.dic",      "de_DE_comb.aff",
+    "de_DE_frami.zip",           "de_DE_frami.dic",     "de_DE_frami.aff",
+    "de_DE_neu.zip",             "de_DE_neu.dic",       "de_DE_neu.aff",
+    "el_GR.zip",                 "el_GR.dic",           "el_GR.aff",
+    "en_AU.zip",                 "en_AU.dic",           "en_AU.aff",
+    "en_CA.zip",                 "en_CA.dic",           "en_CA.aff",
+    "en_GB-oed.zip",             "en_GB-oed.dic",       "en_GB-oed.aff",
+    "en_GB.zip",                 "en_GB.dic",           "en_GB.aff",
+    "en_NZ.zip",                 "en_NZ.dic",           "en_NZ.aff",
+    "eo.zip",                    "eo_l3.dic",           "eo_l3.aff",
+    "eo_EO.zip",                 "eo_EO.dic",           "eo_EO.aff",
+    "es_AR.zip",                 "es_AR.dic",           "es_AR.aff",
+    "es_BO.zip",                 "es_BO.dic",           "es_BO.aff",
+    "es_CL.zip",                 "es_CL.dic",           "es_CL.aff",
+    "es_CO.zip",                 "es_CO.dic",           "es_CO.aff",
+    "es_CR.zip",                 "es_CR.dic",           "es_CR.aff",
+    "es_CU.zip",                 "es_CU.dic",           "es_CU.aff",
+    "es_DO.zip",                 "es_DO.dic",           "es_DO.aff",
+    "es_EC.zip",                 "es_EC.dic",           "es_EC.aff",
+    "es_ES.zip",                 "es_ES.dic",           "es_ES.aff",
+    "es_GT.zip",                 "es_GT.dic",           "es_GT.aff",
+    "es_HN.zip",                 "es_HN.dic",           "es_HN.aff",
+    "es_MX.zip",                 "es_MX.dic",           "es_MX.aff",
+    "es_NEW.zip",                "es_NEW.dic",          "es_NEW.aff",
+    "es_NI.zip",                 "es_NI.dic",           "es_NI.aff",
+    "es_PA.zip",                 "es_PA.dic",           "es_PA.aff",
+    "es_PE.zip",                 "es_PE.dic",           "es_PE.aff",
+    "es_PR.zip",                 "es_PR.dic",           "es_PR.aff",
+    "es_PY.zip",                 "es_PY.dic",           "es_PY.aff",
+    "es_SV.zip",                 "es_SV.dic",           "es_SV.aff",
+    "es_UY.zip",                 "es_UY.dic",           "es_UY.aff",
+    "es_VE.zip",                 "es_VE.dic",           "es_VE.aff",
+    "et_EE.zip",                 "et_EE.dic",           "et_EE.aff",
+    "fo_FO.zip",                 "fo_FO.dic",           "fo_FO.aff",
+    "fr_FR-1990_1-3-2.zip",      "fr_FR-1990.dic",      "fr_FR-1990.aff",
+    "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
+    "fr_FR_1-3-2.zip",           "fr_FR.dic",           "fr_FR.aff",
+    "fy_NL.zip",                 "fy_NL.dic",           "fy_NL.aff",
+    "ga_IE.zip",                 "ga_IE.dic",           "ga_IE.aff",
+    "gd_GB.zip",                 "gd_GB.dic",           "gd_GB.aff",
+    "gl_ES.zip",                 "gl_ES.dic",           "gl_ES.aff",
+    "gsc_FR.zip",                "gsc_FR.dic",          "gsc_FR.aff",
+    "gu_IN.zip",                 "gu_IN.dic",           "gu_IN.aff",
+    "he_IL.zip",                 "he_IL.dic",           "he_IL.aff",
+    "hi_IN.zip",                 "hi_IN.dic",           "hi_IN.aff",
+    "hil_PH.zip",                "hil_PH.dic",          "hil_PH.aff",
+    "hr_HR.zip",                 "hr_HR.dic",           "hr_HR.aff",
+    "hu_HU.zip",                 "hu_HU.dic",           "hu_HU.aff",
+    "hu_HU_comb.zip",            "hu_HU.dic",           "hu_HU.aff",
+    "ia.zip",                    "ia.dic",              "ia.aff",
+    "id_ID.zip",                 "id_ID.dic",           "id_ID.aff",
+    "it_IT.zip",                 "it_IT.dic",           "it_IT.aff",
+    "ku_TR.zip",                 "ku_TR.dic",           "ku_TR.aff",
+    "la.zip",                    "la.dic",              "la.aff",
+    "lt_LT.zip",                 "lt_LT.dic",           "lt_LT.aff",
+    "lv_LV.zip",                 "lv_LV.dic",           "lv_LV.aff",
+    "mg_MG.zip",                 "mg_MG.dic",           "mg_MG.aff",
+    "mi_NZ.zip",                 "mi_NZ.dic",           "mi_NZ.aff",
+    "mk_MK.zip",                 "mk_MK.dic",           "mk_MK.aff",
+    "mos_BF.zip",                "mos_BF.dic",          "mos_BF.aff",
+    "mr_IN.zip",                 "mr_IN.dic",           "mr_IN.aff",
+    "ms_MY.zip",                 "ms_MY.dic",           "ms_MY.aff",
+    "nb_NO.zip",                 "nb_NO.dic",           "nb_NO.aff",
+    "ne_NP.zip",                 "ne_NP.dic",           "ne_NP.aff",
+    "nl_NL.zip",                 "nl_NL.dic",           "nl_NL.aff",
+    "nl_med.zip",                "nl_med.dic",          "nl_med.aff",
+    "nn_NO.zip",                 "nn_NO.dic",           "nn_NO.aff",
+    "nr_ZA.zip",                 "nr_ZA.dic",           "nr_ZA.aff",
+    "ns_ZA.zip",                 "ns_ZA.dic",           "ns_ZA.aff",
+    "ny_MW.zip",                 "ny_MW.dic",           "ny_MW.aff",
+    "oc_FR.zip",                 "oc_FR.dic",           "oc_FR.aff",
+    "pl_PL.zip",                 "pl_PL.dic",           "pl_PL.aff",
+    "pt_BR.zip",                 "pt_BR.dic",           "pt_BR.aff",
+    "pt_PT.zip",                 "pt_PT.dic",           "pt_PT.aff",
+    "ro_RO.zip",                 "ro_RO.dic",           "ro_RO.aff",
+    "ru_RU.zip",                 "ru_RU.dic",           "ru_RU.aff",
+    "ru_RU_ye.zip",              "ru_RU_ie.dic",        "ru_RU_ie.aff",
+    "ru_RU_yo.zip",              "ru_RU_yo.dic",        "ru_RU_yo.aff",
+    "rw_RW.zip",                 "rw_RW.dic",           "rw_RW.aff",
+    "sk_SK.zip",                 "sk_SK.dic",           "sk_SK.aff",
+    "sl_SI.zip",                 "sl_SI.dic",           "sl_SI.aff",
+    "sq_AL.zip",                 "sq_AL.dic",           "sq_AL.aff",
+    "ss_ZA.zip",                 "ss_ZA.dic",           "ss_ZA.aff",
+    "st_ZA.zip",                 "st_ZA.dic",           "st_ZA.aff",
+    "sv_SE.zip",                 "sv_SE.dic",           "sv_SE.aff",
+    "sw_KE.zip",                 "sw_KE.dic",           "sw_KE.aff",
+    "tet_ID.zip",                "tet_ID.dic",          "tet_ID.aff",
+    "th_TH.zip",                 "th_TH.dic",           "th_TH.aff",
+    "tl_PH.zip",                 "tl_PH.dic",           "tl_PH.aff",
+    "tn_ZA.zip",                 "tn_ZA.dic",           "tn_ZA.aff",
+    "ts_ZA.zip",                 "ts_ZA.dic",           "ts_ZA.aff",
+    "uk_UA.zip",                 "uk_UA.dic",           "uk_UA.aff",
+    "ve_ZA.zip",                 "ve_ZA.dic",           "ve_ZA.aff",
+    "vi_VN.zip",                 "vi_VN.dic",           "vi_VN.aff",
+    "xh_ZA.zip",                 "xh_ZA.dic",           "xh_ZA.aff",
+    "zu_ZA.zip",                 "zu_ZA.dic",           "zu_ZA.aff",
+  };
+  
+  public void test() throws Exception {
+    // tests holds flat (zip name, .dic entry name, .aff entry name) triples
+    for (int i = 0; i < tests.length; i += 3) {
+      File archive = new File(DICTIONARY_HOME, tests[i]);
+      assert archive.exists();
+      try (ZipFile zip = new ZipFile(archive, IOUtils.CHARSET_UTF_8)) {
+        ZipEntry dicEntry = zip.getEntry(tests[i+1]);
+        assert dicEntry != null;
+        ZipEntry affEntry = zip.getEntry(tests[i+2]);
+        assert affEntry != null;
+
+        // RAM reported for the previous implementation; stays "FAIL" when it cannot load the files
+        String legacySize = "FAIL";
+        try (InputStream dictionary = zip.getInputStream(dicEntry);
+             InputStream affix = zip.getInputStream(affEntry)) {
+          try {
+            HunspellDictionary legacy = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
+            legacySize = RamUsageEstimator.humanSizeOf(legacy);
+          } catch (Throwable t) {}
+        }
+        // fresh streams for the new implementation (the previous pair is closed by now)
+        try (InputStream dictionary = zip.getInputStream(dicEntry);
+             InputStream affix = zip.getInputStream(affEntry)) {
+          Dictionary current = new Dictionary(affix, dictionary);
+          System.out.println(tests[i] + "\t" + legacySize + "\t" + RamUsageEstimator.humanSizeOf(current));
+        }
+      }
+    }
+  }
+  
+  /** Loads just the one dictionary whose zip is named by {@code toTest}, using the new implementation. */
+  public void testOneDictionary() throws Exception {
+    String toTest = "hu_HU.zip";
+    // step by 3: only every third slot of the flat (zip, dic, aff) array holds a zip name
+    for (int i = 0; i < tests.length; i += 3) {
+      if (tests[i].equals(toTest)) {
+        File f = new File(DICTIONARY_HOME, tests[i]);
+        assert f.exists();
+        try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
+          ZipEntry dicEntry = zip.getEntry(tests[i+1]);
+          assert dicEntry != null;
+          ZipEntry affEntry = zip.getEntry(tests[i+2]);
+          assert affEntry != null;
+          try (InputStream dictionary = zip.getInputStream(dicEntry);
+               InputStream affix = zip.getInputStream(affEntry)) {
+            new Dictionary(affix, dictionary);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
new file mode 100644
index 00000000000..14c6e8967d0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
@@ -0,0 +1,109 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDictionary extends LuceneTestCase {
+
+  /** Loads the single-character-flag fixture and checks suffix, prefix and word lookups. */
+  public void testSimpleDictionary() throws Exception {
+    // try-with-resources: the streams are released even if construction or an assertion fails
+    try (InputStream affixStream = getClass().getResourceAsStream("simple.aff");
+         InputStream dictStream = getClass().getResourceAsStream("simple.dic")) {
+      Dictionary dictionary = new Dictionary(affixStream, dictStream);
+      assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+      assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+      char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
+      assertNotNull(flags);
+      assertEquals(1, flags.length);
+      assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5, new BytesRef()).length);
+    }
+  }
+
+  /** Runs the same lookups against the fixture that uses "FLAG long" and "AF" flag aliases. */
+  public void testCompressedDictionary() throws Exception {
+    try (InputStream affixStream = getClass().getResourceAsStream("compressed.aff");
+         InputStream dictStream = getClass().getResourceAsStream("compressed.dic")) {
+      Dictionary dictionary = new Dictionary(affixStream, dictStream);
+      assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+      assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+      assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
+    }
+  }
+
+  /** A malformed affix rule must be reported as a ParseException with the expected message and offset. */
+  public void testInvalidData() throws Exception {
+    try (InputStream affixStream = getClass().getResourceAsStream("broken.aff");
+         InputStream dictStream = getClass().getResourceAsStream("simple.dic")) {
+      try {
+        new Dictionary(affixStream, dictStream);
+        fail("didn't get expected exception");
+      } catch (ParseException expected) {
+        assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
+        assertEquals(23, expected.getErrorOffset());
+      }
+    }
+  }
+
+  /**
+   * Stream wrapper that records whether {@link #close()} was called.
+   * Static: it needs no reference to the enclosing test instance.
+   */
+  private static class CloseCheckInputStream extends FilterInputStream {
+    private boolean closed = false;
+
+    public CloseCheckInputStream(InputStream delegate) {
+      super(delegate);
+    }
+
+    @Override
+    public void close() throws IOException {
+      this.closed = true;
+      super.close();
+    }
+
+    /** @return true once close() has been invoked on this wrapper */
+    public boolean isClosed() {
+      return this.closed;
+    }
+  }
+
+  /** The Dictionary constructor must leave both input streams open; closing them is the caller's job. */
+  public void testResourceCleanup() throws Exception {
+    CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff"));
+    CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic"));
+
+    new Dictionary(affixStream, dictStream);
+
+    assertFalse(affixStream.isClosed());
+    assertFalse(dictStream.isClosed());
+
+    affixStream.close();
+    dictStream.close();
+
+    assertTrue(affixStream.isClosed());
+    assertTrue(dictStream.isClosed());
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
new file mode 100644
index 00000000000..eafb1f272cf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
@@ -0,0 +1,87 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
+  private static Dictionary dictionary;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    try (InputStream affix = TestStemmer.class.getResourceAsStream("simple.aff");
+         InputStream words = TestStemmer.class.getResourceAsStream("simple.dic")) {
+      dictionary = new Dictionary(affix, words);
+    }
+  }
+
+  @AfterClass
+  public static void afterClass() {
+    dictionary = null;
+  }
+
+  /** A token marked as a keyword must pass through without any stem being emitted for it. */
+  public void testKeywordAttribute() throws IOException {
+    MockTokenizer plain = whitespaceMockTokenizer("lucene is awesome");
+    plain.setEnableChecks(true);
+    Hunspell2StemFilter stemmed = new Hunspell2StemFilter(plain, dictionary, TestUtil.nextInt(random(), 1, 3));
+    assertTokenStreamContents(stemmed, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
+    // same input again, this time routed through a keyword marker built from "Lucene"
+    MockTokenizer marked = whitespaceMockTokenizer("lucene is awesome");
+    CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
+    Hunspell2StemFilter unstemmed = new Hunspell2StemFilter(new SetKeywordMarkerFilter(marked, keywords), dictionary, TestUtil.nextInt(random(), 1, 3));
+    assertTokenStreamContents(unstemmed, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
+  }
+
+  /** Feeds random strings through a whitespace tokenizer followed by the stem filter. */
+  public void testRandomStrings() throws Exception {
+    Analyzer stemming = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(source, new Hunspell2StemFilter(source, dictionary, TestUtil.nextInt(random(), 1, 3)));
+      }
+    };
+    checkRandomData(random(), stemming, 1000*RANDOM_MULTIPLIER);
+  }
+
+  /** An empty term must come out of the filter as an empty term. */
+  public void testEmptyTerm() throws IOException {
+    Analyzer keywordStemming = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new KeywordTokenizer();
+        return new TokenStreamComponents(source, new Hunspell2StemFilter(source, dictionary, TestUtil.nextInt(random(), 1, 3)));
+      }
+    };
+    checkOneTerm(keywordStemming, "", "");
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
new file mode 100644
index 00000000000..d95e2be04b6
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the Hunspell stemmer loads from factory
+ */
+public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /** A filter created through the factory from the simple fixture stems "abc" to "ab". */
+  public void testStemming() throws Exception {
+    TokenStream source = whitespaceMockTokenizer(new StringReader("abc"));
+    TokenStream stemmed = tokenFilterFactory("Hunspell2Stem",
+        "dictionary", "simple.dic",
+        "affix", "simple.aff").create(source);
+    assertTokenStreamContents(stemmed, new String[] { "ab" });
+  }
+
+  /** An unrecognized argument must be rejected with an "Unknown parameters" IllegalArgumentException. */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Hunspell2Stem",
+          "dictionary", "simple.dic",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException rejected) {
+      assertTrue(rejected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
new file mode 100644
index 00000000000..ea98f65256f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
@@ -0,0 +1,105 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+public class TestStemmer extends LuceneTestCase {
+  private static Stemmer stemmer;
+
+  /** Builds the stemmer shared by all tests from the simple.aff/simple.dic fixture. */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    try (InputStream affix = TestStemmer.class.getResourceAsStream("simple.aff");
+         InputStream words = TestStemmer.class.getResourceAsStream("simple.dic")) {
+      stemmer = new Stemmer(new Dictionary(affix, words));
+    }
+  }
+
+  @AfterClass
+  public static void afterClass() {
+    stemmer = null;
+  }
+
+  public void testSimpleSuffix() {
+    assertStemsTo("lucene", "lucene", "lucen");
+    assertStemsTo("mahoute", "mahout");
+  }
+
+  public void testSimplePrefix() {
+    assertStemsTo("solr", "olr");
+  }
+
+  public void testRecursiveSuffix() {
+    assertStemsTo("abcd", "ab");
+  }
+
+  /** All forms unmunched from the dictionary, each with its expected stem(s). */
+  public void testAllStems() {
+    assertStemsTo("ab", "ab");
+    assertStemsTo("abc", "ab");
+    assertStemsTo("apach", "apach");
+    assertStemsTo("apache", "apach");
+    assertStemsTo("foo", "foo");
+    assertStemsTo("food", "foo");
+    assertStemsTo("foos", "foo");
+    assertStemsTo("lucen", "lucen");
+    assertStemsTo("lucene", "lucen", "lucene");
+    assertStemsTo("mahout", "mahout");
+    assertStemsTo("mahoute", "mahout");
+    assertStemsTo("moo", "moo");
+    assertStemsTo("mood", "moo");
+    assertStemsTo("olr", "olr");
+    assertStemsTo("solr", "olr");
+  }
+
+  /** Bogus forms that must not stem: each is expected to produce an empty stem list. */
+  public void testBogusStems() {
+    assertStemsTo("abs");
+    assertStemsTo("abe");
+    assertStemsTo("sab");
+    assertStemsTo("sapach");
+    assertStemsTo("sapache");
+    assertStemsTo("apachee");
+    assertStemsTo("sfoo");
+    assertStemsTo("sfoos");
+    assertStemsTo("fooss");
+    assertStemsTo("lucenee");
+    assertStemsTo("solre");
+  }
+
+  /** Asserts that stemming {@code s} yields exactly the {@code expected} stem strings, in any order. */
+  private void assertStemsTo(String s, String... expected) {
+    List<Stem> found = stemmer.stem(s);
+    String actual[] = new String[found.size()];
+    int upto = 0;
+    for (Stem stem : found) {
+      actual[upto++] = stem.getStemString();
+    }
+    Arrays.sort(expected);
+    Arrays.sort(actual);
+    assertArrayEquals(expected, actual);
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
new file mode 100644
index 00000000000..3b780cd1d7b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
@@ -0,0 +1,24 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A   0     e         n
+SFX A   0     e         t
+SFX A   0     e         h
+
+SFX C Y 2
+SFX C   0     d/C       c
+SFX C   0     c         b
+
+SFX D Y 1
+SFX D   0     s         o
+
+SFX E Y 1
+SFX E   0     d         o
+
+PFX B Y 1
+PFX B   0     s         o
+
+#wrong rule (only 4 elements)
+PFX A0 Y 1
+PFX A0 0 a
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
new file mode 100644
index 00000000000..e4a1b37300f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
@@ -0,0 +1,29 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+FLAG long
+
+AF 5
+AF AA
+AF BB
+AF CC
+AF DD
+AF EE
+
+SFX AA Y 3
+SFX AA   0     e         n
+SFX AA   0     e         t
+SFX AA   0     e         h
+
+SFX CC Y 2
+SFX CC   0     d/3       c
+SFX CC   0     c         b
+
+SFX DD Y 1
+SFX DD   0     s         o
+
+SFX EE Y 1
+SFX EE   0     d         o
+
+PFX BB Y 1
+PFX BB   0     s         o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
new file mode 100644
index 00000000000..dd3890fae31
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
@@ -0,0 +1,9 @@
+6
+ab/3
+apach/1
+foo/4
+foo/5
+lucen/1
+lucene
+mahout/1
+olr/2
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
new file mode 100644
index 00000000000..db9423dcad1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
@@ -0,0 +1,20 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A   0     e         n
+SFX A   0     e         t
+SFX A   0     e         h
+
+SFX C Y 2
+SFX C   0     d/C       c
+SFX C   0     c         b
+
+SFX D Y 1
+SFX D   0     s         o
+
+SFX E Y 1
+SFX E   0     d         o
+
+PFX B Y 1
+PFX B   0     s         o
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
new file mode 100644
index 00000000000..f7bbab3ba67
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
@@ -0,0 +1,10 @@
+9
+ab/C
+apach/A
+foo/D
+foo/E
+lucen/A
+lucene
+mahout/A
+moo/E
+olr/B

From e541984b62bdc3acc5947f761d40371c75b38de3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 06:04:28 +0000
Subject: [PATCH 03/17] remove treemap (TODO: refactor the sorter and use that)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571154 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Dictionary.java | 90 +++++++++++++------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index a7b9a58f080..c8068aa41b9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -35,11 +35,11 @@ import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.TreeMap;
 
 /**
  * In-memory structure for the dictionary (.dic) and affix (.aff)
@@ -93,16 +93,10 @@ public class Dictionary {
     buffered.reset();
     CharsetDecoder decoder = getJavaEncoding(encoding);
     readAffixFile(buffered, decoder);
-    TreeMap<BytesRef,Integer> tempWords = new TreeMap<BytesRef,Integer>();
     flagLookup.add(new BytesRef()); // no flags -> ord 0
-    readDictionaryFile(dictionary, decoder, tempWords);
     PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
-    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4
-    IntsRef scratchInts = new IntsRef();
-    for (Map.Entry<BytesRef,Integer> e : tempWords.entrySet()) {
-      UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts);
-      b.add(scratchInts, e.getValue().longValue());
-    }
+    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o);
+    readDictionaryFile(dictionary, decoder, b);
     words = b.finish();
   }
 
@@ -366,20 +360,51 @@ public class Dictionary {
    * @param decoder CharsetDecoder used to decode the contents of the file
    * @throws IOException Can be thrown while reading from the file
    */
-  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap<BytesRef,Integer> words) throws IOException {
+  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder<Long> words) throws IOException {
     BytesRef flagsScratch = new BytesRef();
-    BytesRef flagsScratch2 = new BytesRef();
+    IntsRef scratchInts = new IntsRef();
     
     BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
     // TODO: don't create millions of strings.
-    String line = reader.readLine(); // first line is number of entries
+    String line = reader.readLine(); // first line is number of entries (approximately, sometimes)
     // sometimes the number of entries has a comment/copyright after it
     line = line.replaceFirst("\\s*\\#.*$", "");
     int numEntries = Integer.parseInt(line);
     
+    String lines[] = new String[numEntries];
+    int upto = 0;
+    while ((line = reader.readLine()) != null) {
+      if (upto == lines.length) { // header count is approximate; x1.25 alone never grows arrays shorter than 4
+        lines = Arrays.copyOf(lines, Math.max(lines.length + 1, (int)(lines.length * 1.25)));
+      }
+      lines[upto++] = line;
+    }
+    
+    // TODO: just replace this with offline sorter?
+    Arrays.sort(lines, 0, upto, new Comparator<String>() {
+      @Override
+      public int compare(String o1, String o2) {
+        int sep1 = o1.lastIndexOf('/');
+        if (sep1 >= 0) {
+          o1 = o1.substring(0, sep1);
+        }
+        
+        int sep2 = o2.lastIndexOf('/');
+        if (sep2 >= 0) {
+          o2 = o2.substring(0, sep2);
+        }
+        return o1.compareTo(o2);
+      }
+    });
+    
     // TODO: the flags themselves can be double-chars (long) or also numeric
     // either way the trick is to encode them as char... but they must be parsed differently
-    while ((line = reader.readLine()) != null) {
+    
+    BytesRef currentEntry = new BytesRef();
+    char currentFlags[] = new char[0];
+    
+    for (int i = 0; i < upto; i++) {
+      line = lines[i];
       String entry;
       char wordForm[];
       
@@ -405,24 +430,33 @@ public class Dictionary {
       }
 
       BytesRef scratch = new BytesRef(entry);
-      Integer existingOrd = words.get(scratch);
-      final char mergedEntries[];
-      if (existingOrd == null || existingOrd == 0) {
-        mergedEntries = wordForm;
+      int cmp = scratch.compareTo(currentEntry);
+      if (cmp < 0) {
+        throw new IllegalArgumentException("out of order: " + scratch.utf8ToString() + " < " + currentEntry.utf8ToString());
+      } else if (cmp == 0) {
+        currentFlags = merge(currentFlags, wordForm);
       } else {
-        flagLookup.get(existingOrd, flagsScratch2);
-        mergedEntries = merge(decodeFlags(flagsScratch2), wordForm);
+        final int hashCode = encodeFlagsWithHash(flagsScratch, currentFlags);
+        int ord = flagLookup.add(flagsScratch, hashCode);
+        if (ord < 0) {
+          // already exists in our hash
+          ord = (-ord)-1;
+        }
+        UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts);
+        words.add(scratchInts, (long)ord);
+        currentEntry = scratch;
+        currentFlags = wordForm;
       }
-
-      final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries);
-      int ord = flagLookup.add(flagsScratch, hashCode);
-      if (ord < 0) {
-        // already exists in our hash
-        ord = (-ord)-1;
-      }
-      
-      words.put(scratch, ord);
     }
+    
+    final int hashCode = encodeFlagsWithHash(flagsScratch, currentFlags);
+    int ord = flagLookup.add(flagsScratch, hashCode);
+    if (ord < 0) {
+      // already exists in our hash
+      ord = (-ord)-1;
+    }
+    UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts);
+    words.add(scratchInts, (long)ord);
   }
   
   static char[] decodeFlags(BytesRef b) {

From ad20d99b3571181504d956a5056c449de5968afd Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 14:53:21 +0000
Subject: [PATCH 04/17] break out this class

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571305 13f79535-47bb-0310-9956-ffa450edef68
---
 .../hunspell2/Hunspell2StemFilter.java        |  1 -
 .../lucene/analysis/hunspell2/Stem.java       | 98 +++++++++++++++++++
 .../lucene/analysis/hunspell2/Stemmer.java    | 78 ---------------
 .../hunspell2/TestAllDictionaries.java        |  3 +-
 .../analysis/hunspell2/TestStemmer.java       |  1 -
 5 files changed, 100 insertions(+), 81 deletions(-)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
index f9dfb770ab2..45941345342 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
@@ -22,7 +22,6 @@ import java.util.List;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java
new file mode 100644
index 00000000000..d3c8d4c86ab
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
+ * that were used to change the word into the stem.
+ */
+final class Stem {
+  final List<Affix> prefixes = new ArrayList<Affix>();
+  final List<Affix> suffixes = new ArrayList<Affix>();
+  final char stem[]; // stem text; only the first stemLength chars are meaningful
+  final int stemLength; // number of valid chars in stem
+
+  /**
+   * Creates a new Stem wrapping the given word stem. The array is stored as-is, not copied.
+   * @param stem Buffer holding the stem of a word
+   * @param stemLength Number of chars at the start of {@code stem} that make up the stem
+   */
+  public Stem(char stem[], int stemLength) {
+    this.stem = stem;
+    this.stemLength = stemLength;
+  }
+
+  /**
+   * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
+   * depth first, the prefix is added to the front of the list
+   *
+   * @param prefix Prefix to add to the list of prefixes for this stem
+   */
+  public void addPrefix(Affix prefix) {
+    prefixes.add(0, prefix);
+  }
+
+  /**
+   * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
+   * depth first, the suffix is added to the end of the list
+   *
+   * @param suffix Suffix to add to the list of suffixes for this stem
+   */
+  public void addSuffix(Affix suffix) {
+    suffixes.add(suffix);
+  }
+
+  /**
+   * Returns the list of prefixes used to generate the stem (the internal list itself, not a copy)
+   *
+   * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
+   */
+  public List<Affix> getPrefixes() {
+    return prefixes;
+  }
+
+  /**
+   * Returns the list of suffixes used to generate the stem (the internal list itself, not a copy)
+   * 
+   * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
+   */
+  public List<Affix> getSuffixes() {
+    return suffixes;
+  }
+
+  /**
+   * Returns the array holding the text of the word's stem (the array itself, not a copy).
+   * Only the first {@link #getStemLength()} chars are valid.
+   */
+  public char[] getStem() {
+    return stem;
+  }
+
+  /** Returns the valid length of the text in {@link #getStem()} */
+  public int getStemLength() {
+    return stemLength;
+  }
+  
+  /** Only use this if you really need a string (e.g. for testing); allocates a new String on each call */
+  public String getStemString() {
+    return new String(stem, 0, stemLength);
+  }
+}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 7d36c81e4ae..aa00836d6fe 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -207,82 +207,4 @@ final class Stemmer {
   private boolean hasCrossCheckedFlag(char flag, char[] flags) {
     return flags == null || Arrays.binarySearch(flags, flag) >= 0;
   }
-
-  /**
-   * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
-   * that were used to change the word into the stem.
-   */
-  public static class Stem {
-
-    private final List<Affix> prefixes = new ArrayList<Affix>();
-    private final List<Affix> suffixes = new ArrayList<Affix>();
-    private final char stem[];
-    private final int stemLength;
-
-    /**
-     * Creates a new Stem wrapping the given word stem
-     *
-     * @param stem Stem of a word
-     */
-    public Stem(char stem[], int stemLength) {
-      this.stem = stem;
-      this.stemLength = stemLength;
-    }
-
-    /**
-     * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
-     * depth first, the prefix is added to the front of the list
-     *
-     * @param prefix Prefix to add to the list of prefixes for this stem
-     */
-    public void addPrefix(Affix prefix) {
-      prefixes.add(0, prefix);
-    }
-
-    /**
-     * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
-     * depth first, the suffix is added to the end of the list
-     *
-     * @param suffix Suffix to add to the list of suffixes for this stem
-     */
-    public void addSuffix(Affix suffix) {
-      suffixes.add(suffix);
-    }
-
-    /**
-     * Returns the list of prefixes used to generate the stem
-     *
-     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
-     */
-    public List<Affix> getPrefixes() {
-      return prefixes;
-    }
-
-    /**
-     * Returns the list of suffixes used to generate the stem
-     * 
-     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
-     */
-    public List<Affix> getSuffixes() {
-      return suffixes;
-    }
-
-    /**
-     * Returns the text of the word's stem.
-     * @see #getStemLength()
-     */
-    public char[] getStem() {
-      return stem;
-    }
-
-    /** Returns the valid length of the text in {@link #getStem()} */
-    public int getStemLength() {
-      return stemLength;
-    }
-    
-    /** Only use this if you really need a string (e.g. for testing) */
-    public String getStemString() {
-      return new String(stem, 0, stemLength);
-    }
-  }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
index 02ccedb9be7..ecb21b97a7c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -26,13 +26,14 @@ import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.junit.Ignore;
 
 /**
  * Can be retrieved via:
  * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
  * Note some of the files differ only in case. This may be a problem on your operating system!
  */
-//@Ignore("enable manually")
+@Ignore("enable manually")
 public class TestAllDictionaries extends LuceneTestCase {
   
   // set this to the location of where you downloaded all the files
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
index ea98f65256f..a8ac2a83fa9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;

From 7f6a40e15782d97dfedddcf6cc2b42f9c811654d Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 15:45:07 +0000
Subject: [PATCH 05/17] LUCENE-5468: factor OfflineSorter out of suggest

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571321 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/util}/BytesRefArray.java    |  2 +-
 .../apache/lucene/util/OfflineSorter.java}    | 33 ++++++----
 .../lucene/util}/TestBytesRefArray.java       |  2 +-
 .../lucene/util/TestOfflineSorter.java}       | 60 ++++++++++---------
 .../search/suggest/BufferedInputIterator.java |  1 +
 .../lucene/search/suggest/InMemorySorter.java |  1 +
 .../search/suggest/SortedInputIterator.java   | 15 ++---
 .../suggest/analyzing/AnalyzingSuggester.java | 12 ++--
 .../suggest/analyzing/FreeTextSuggester.java  |  5 +-
 .../search/suggest/fst/ExternalRefSorter.java | 25 ++++----
 .../suggest/fst/FSTCompletionLookup.java      | 29 ++++-----
 .../suggest/fst/WFSTCompletionLookup.java     |  2 +-
 .../suggest/fst/BytesRefSortersTest.java      |  4 +-
 .../search/suggest/fst/LargeInputFST.java     | 11 +++-
 14 files changed, 114 insertions(+), 88 deletions(-)
 rename lucene/{suggest/src/java/org/apache/lucene/search/suggest => core/src/java/org/apache/lucene/util}/BytesRefArray.java (99%)
 rename lucene/{suggest/src/java/org/apache/lucene/search/suggest/Sort.java => core/src/java/org/apache/lucene/util/OfflineSorter.java} (95%)
 rename lucene/{suggest/src/test/org/apache/lucene/search/suggest => core/src/test/org/apache/lucene/util}/TestBytesRefArray.java (98%)
 rename lucene/{suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java => core/src/test/org/apache/lucene/util/TestOfflineSorter.java} (72%)

diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java
similarity index 99%
rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java
rename to lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java
index e7a44fc37e0..eb0aa1a808e 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search.suggest;
+package org.apache.lucene.util;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
similarity index 95%
rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java
rename to lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
index 8c6c20f1444..76781f8b8c7 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java
+++ b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search.suggest;
+package org.apache.lucene.util;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,11 +17,24 @@ package org.apache.lucene.search.suggest;
  * limitations under the License.
  */
 
-import java.io.*;
-import java.util.*;
-
-import org.apache.lucene.util.*;
-import org.apache.lucene.util.PriorityQueue;
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
 
 /**
  * On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
@@ -35,7 +48,7 @@ import org.apache.lucene.util.PriorityQueue;
  * @lucene.experimental
  * @lucene.internal
  */
-public final class Sort {
+public final class OfflineSorter {
   /** Convenience constant for megabytes */
   public final static long MB = 1024 * 1024;
   /** Convenience constant for gigabytes */
@@ -170,7 +183,7 @@ public final class Sort {
    * @see #defaultTempDir()
    * @see BufferSize#automatic()
    */
-  public Sort() throws IOException {
+  public OfflineSorter() throws IOException {
     this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
   }
   
@@ -180,14 +193,14 @@ public final class Sort {
    * @see #defaultTempDir()
    * @see BufferSize#automatic()
    */
-  public Sort(Comparator<BytesRef> comparator) throws IOException {
+  public OfflineSorter(Comparator<BytesRef> comparator) throws IOException {
     this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
   }
 
   /**
    * All-details constructor.
    */
-  public Sort(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
+  public OfflineSorter(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
     if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
       throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
     }
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java
similarity index 98%
rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java
rename to lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java
index 935b71bc529..9fcd6a1b6df 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search.suggest;
+package org.apache.lucene.util;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
similarity index 72%
rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java
rename to lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
index 540fadedf11..b7f14d02ffb 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search.suggest.fst;
+package org.apache.lucene.util;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,70 +17,72 @@ package org.apache.lucene.search.suggest.fst;
  * limitations under the License.
  */
 
-import java.io.*;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 
-import org.apache.lucene.search.suggest.Sort;
-import org.apache.lucene.search.suggest.Sort.BufferSize;
-import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
-import org.apache.lucene.search.suggest.Sort.SortInfo;
-import org.apache.lucene.util.*;
-import org.junit.*;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.OfflineSorter.BufferSize;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.OfflineSorter.SortInfo;
+import org.apache.lucene.util.TestUtil;
 
 /**
  * Tests for on-disk merge sorting.
  */
-public class TestSort extends LuceneTestCase {
+public class TestOfflineSorter extends LuceneTestCase {
   private File tempDir;
 
-  @Before
-  public void prepareTempDir() throws IOException {
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
     tempDir = TestUtil.getTempDir("mergesort");
     TestUtil.rmDir(tempDir);
     tempDir.mkdirs();
   }
   
-  @After
-  public void cleanup() throws IOException {
+  @Override
+  public void tearDown() throws Exception {
     if (tempDir != null)
       TestUtil.rmDir(tempDir);
+    super.tearDown();
   }
 
-  @Test
   public void testEmpty() throws Exception {
-    checkSort(new Sort(), new byte [][] {});
+    checkSort(new OfflineSorter(), new byte [][] {});
   }
 
-  @Test
   public void testSingleLine() throws Exception {
-    checkSort(new Sort(), new byte [][] {
+    checkSort(new OfflineSorter(), new byte [][] {
         "Single line only.".getBytes("UTF-8")
     });
   }
 
-  @Test
   public void testIntermediateMerges() throws Exception {
     // Sort 20 mb worth of data with 1mb buffer, binary merging.
-    SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2), 
-        generateRandom((int)Sort.MB * 20));
+    SortInfo info = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), 2), 
+        generateRandom((int)OfflineSorter.MB * 20));
     assertTrue(info.mergeRounds > 10);
   }
 
-  @Test
   public void testSmallRandom() throws Exception {
     // Sort 20 mb worth of data with 1mb buffer.
-    SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), 
-        generateRandom((int)Sort.MB * 20));
+    SortInfo sortInfo = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), 
+        generateRandom((int)OfflineSorter.MB * 20));
     assertEquals(1, sortInfo.mergeRounds);
   }
 
-  @Test @Nightly
+  @Nightly
   public void testLargerRandom() throws Exception {
     // Sort 100MB worth of data with 15mb buffer.
-    checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), 
-        generateRandom((int)Sort.MB * 100));
+    checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(16), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), 
+        generateRandom((int)OfflineSorter.MB * 100));
   }
 
   private byte[][] generateRandom(int howMuchData) {
@@ -108,9 +110,9 @@ public class TestSort extends LuceneTestCase {
     }
   };
   /**
-   * Check sorting data on an instance of {@link Sort}.
+   * Check sorting data on an instance of {@link OfflineSorter}.
    */
-  private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
+  private SortInfo checkSort(OfflineSorter sort, byte[][] data) throws IOException {
     File unsorted = writeAll("unsorted", data);
 
     Arrays.sort(data, unsignedByteOrderComparator);
@@ -147,7 +149,7 @@ public class TestSort extends LuceneTestCase {
 
   private File writeAll(String name, byte[][] data) throws IOException {
     File file = new File(tempDir, name);
-    ByteSequencesWriter w = new Sort.ByteSequencesWriter(file);
+    ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(file);
     for (byte [] datum : data) {
       w.write(datum);
     }
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java
index b9772fafebd..96c7cf85f60 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
 import org.apache.lucene.util.Counter;
 
 /**
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java
index 0efc3a5fa7a..42e19a8f9b9 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java
@@ -21,6 +21,7 @@ import java.util.Comparator;
 
 import org.apache.lucene.search.suggest.fst.BytesRefSorter;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.Counter;
 
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
index d804f38e1b1..d7011d435d9 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
@@ -21,13 +21,14 @@ import java.io.File;
 import java.io.IOException;
 import java.util.Comparator;
 
-import org.apache.lucene.search.suggest.Sort.ByteSequencesReader;
-import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 
 /**
  * This wrapper buffers incoming elements and makes sure they are sorted based on given comparator.
@@ -141,13 +142,13 @@ public class SortedInputIterator implements InputIterator {
     }
   };
   
-  private Sort.ByteSequencesReader sort() throws IOException {
+  private ByteSequencesReader sort() throws IOException {
     String prefix = getClass().getSimpleName();
-    File directory = Sort.defaultTempDir();
+    File directory = OfflineSorter.defaultTempDir();
     tempInput = File.createTempFile(prefix, ".input", directory);
     tempSorted = File.createTempFile(prefix, ".sorted", directory);
     
-    final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
+    final OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
     boolean success = false;
     try {
       BytesRef spare;
@@ -158,8 +159,8 @@ public class SortedInputIterator implements InputIterator {
         encode(writer, output, buffer, spare, source.payload(), source.weight());
       }
       writer.close();
-      new Sort(tieBreakByCostComparator).sort(tempInput, tempSorted);
-      ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
+      new OfflineSorter(tieBreakByCostComparator).sort(tempInput, tempSorted);
+      ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(tempSorted);
       success = true;
       return reader;
       
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 6b2c1f6bbe1..5dad351a83f 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -31,7 +31,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.search.suggest.InputIterator;
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.Sort;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataInput;
@@ -56,6 +55,7 @@ import org.apache.lucene.util.fst.PairOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util.MinResult;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.OfflineSorter;
 
 /**
  * Suggester that first analyzes the surface form, adds the
@@ -380,14 +380,14 @@ public class AnalyzingSuggester extends Lookup {
   @Override
   public void build(InputIterator iterator) throws IOException {
     String prefix = getClass().getSimpleName();
-    File directory = Sort.defaultTempDir();
+    File directory = OfflineSorter.defaultTempDir();
     File tempInput = File.createTempFile(prefix, ".input", directory);
     File tempSorted = File.createTempFile(prefix, ".sorted", directory);
 
     hasPayloads = iterator.hasPayloads();
 
-    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
-    Sort.ByteSequencesReader reader = null;
+    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+    OfflineSorter.ByteSequencesReader reader = null;
     BytesRef scratch = new BytesRef();
 
     TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
@@ -463,12 +463,12 @@ public class AnalyzingSuggester extends Lookup {
       writer.close();
 
       // Sort all input/output pairs (required by FST.Builder):
-      new Sort(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted);
+      new OfflineSorter(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted);
 
       // Free disk space:
       tempInput.delete();
 
-      reader = new Sort.ByteSequencesReader(tempSorted);
+      reader = new OfflineSorter.ByteSequencesReader(tempSorted);
      
       PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
       Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
index db332474c9a..f425235e272 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
@@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.analyzing;
 // TODO
 //   - test w/ syns
 //   - add pruning of low-freq ngrams?
+
 import java.io.File;
 import java.io.IOException;
 //import java.io.PrintWriter;
@@ -54,7 +55,6 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.suggest.InputIterator;
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.Sort;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
@@ -74,6 +74,7 @@ import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util.MinResult;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.OfflineSorter;
 
 /**
  * Builds an ngram model from the text sent to {@link
@@ -287,7 +288,7 @@ public class FreeTextSuggester extends Lookup {
     }
 
     String prefix = getClass().getSimpleName();
-    File directory = Sort.defaultTempDir();
+    File directory = OfflineSorter.defaultTempDir();
     // TODO: messy ... java7 has Files.createTempDirectory
     // ... but 4.x is java6:
     File tempIndexPath = null;
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
index 0a06b861e83..8ceb937e74d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
@@ -17,14 +17,15 @@ package org.apache.lucene.search.suggest.fst;
  * limitations under the License.
  */
 
-import java.io.*;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
 import java.util.Comparator;
 
-import org.apache.lucene.search.suggest.Sort;
-import org.apache.lucene.search.suggest.Sort.ByteSequencesReader;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.OfflineSorter;
 
 /**
  * Builds and iterates over sequences stored on disk.
@@ -32,19 +33,19 @@ import org.apache.lucene.util.IOUtils;
  * @lucene.internal
  */
 public class ExternalRefSorter implements BytesRefSorter, Closeable {
-  private final Sort sort;
-  private Sort.ByteSequencesWriter writer;
+  private final OfflineSorter sort;
+  private OfflineSorter.ByteSequencesWriter writer;
   private File input;
   private File sorted;
   
   /**
    * Will buffer all sequences to a temporary file and then sort (all on-disk).
    */
-  public ExternalRefSorter(Sort sort) throws IOException {
+  public ExternalRefSorter(OfflineSorter sort) throws IOException {
     this.sort = sort;
     this.input = File.createTempFile("RefSorter-", ".raw",
-        Sort.defaultTempDir());
-    this.writer = new Sort.ByteSequencesWriter(input);
+        OfflineSorter.defaultTempDir());
+    this.writer = new OfflineSorter.ByteSequencesWriter(input);
   }
   
   @Override
@@ -59,14 +60,14 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
       closeWriter();
       
       sorted = File.createTempFile("RefSorter-", ".sorted",
-          Sort.defaultTempDir());
+          OfflineSorter.defaultTempDir());
       sort.sort(input, sorted);
       
       input.delete();
       input = null;
     }
     
-    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
+    return new ByteSequenceIterator(new OfflineSorter.ByteSequencesReader(sorted));
   }
   
   private void closeWriter() throws IOException {
@@ -93,10 +94,10 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
    * Iterate over byte refs in a file.
    */
   class ByteSequenceIterator implements BytesRefIterator {
-    private final ByteSequencesReader reader;
+    private final OfflineSorter.ByteSequencesReader reader;
     private BytesRef scratch = new BytesRef();
     
-    public ByteSequenceIterator(ByteSequencesReader reader) {
+    public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader) {
       this.reader = reader;
     }
     
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
index 38132cad444..704c1fb9c5b 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
@@ -19,26 +19,27 @@ package org.apache.lucene.search.suggest.fst;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.lucene.search.suggest.InputIterator;
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.Sort.SortInfo;
-import org.apache.lucene.search.suggest.Sort;
 import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
 import org.apache.lucene.search.suggest.tst.TSTLookup;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.store.OutputStreamDataOutput;
-import org.apache.lucene.util.*;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.OfflineSorter.SortInfo;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * An adapter from {@link Lookup} API to {@link FSTCompletion}.
@@ -150,12 +151,12 @@ public class FSTCompletionLookup extends Lookup {
       throw new IllegalArgumentException("this suggester doesn't support payloads");
     }
     File tempInput = File.createTempFile(
-        FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
+        FSTCompletionLookup.class.getSimpleName(), ".input", OfflineSorter.defaultTempDir());
     File tempSorted = File.createTempFile(
-        FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());
+        FSTCompletionLookup.class.getSimpleName(), ".sorted", OfflineSorter.defaultTempDir());
 
-    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
-    Sort.ByteSequencesReader reader = null;
+    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+    OfflineSorter.ByteSequencesReader reader = null;
     ExternalRefSorter sorter = null;
 
     // Push floats up front before sequences to sort them. For now, assume they are non-negative.
@@ -180,13 +181,13 @@ public class FSTCompletionLookup extends Lookup {
 
       // We don't know the distribution of scores and we need to bucket them, so we'll sort
       // and divide into equal buckets.
-      SortInfo info = new Sort().sort(tempInput, tempSorted);
+      SortInfo info = new OfflineSorter().sort(tempInput, tempSorted);
       tempInput.delete();
       FSTCompletionBuilder builder = new FSTCompletionBuilder(
-          buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength);
+          buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);
 
       final int inputLines = info.lines;
-      reader = new Sort.ByteSequencesReader(tempSorted);
+      reader = new OfflineSorter.ByteSequencesReader(tempSorted);
       long line = 0;
       int previousBucket = 0;
       int previousScore = 0;
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
index ab668413831..d654f182e48 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
@@ -25,7 +25,6 @@ import java.util.List;
 
 import org.apache.lucene.search.suggest.InputIterator;
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter;
 import org.apache.lucene.search.suggest.SortedInputIterator;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
@@ -43,6 +42,7 @@ import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util.MinResult;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 
 /**
  * Suggester based on a weighted FST: it first traverses the prefix, 
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
index f8ccd35b55c..82775898475 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
@@ -18,16 +18,16 @@ package org.apache.lucene.search.suggest.fst;
  */
 
 import org.apache.lucene.search.suggest.InMemorySorter;
-import org.apache.lucene.search.suggest.Sort;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.OfflineSorter;
 import org.junit.Test;
 
 public class BytesRefSortersTest extends LuceneTestCase {
   @Test
   public void testExternalRefSorter() throws Exception {
-    ExternalRefSorter s = new ExternalRefSorter(new Sort());
+    ExternalRefSorter s = new ExternalRefSorter(new OfflineSorter());
     check(s);
     s.close();
   }
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java
index 48a1409d9f9..0cb6c668d02 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java
@@ -17,10 +17,15 @@ package org.apache.lucene.search.suggest.fst;
  * limitations under the License.
  */
 
-import java.io.*;
 
-import org.apache.lucene.search.suggest.Sort;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OfflineSorter;
 
 /**
  * Try to build a suggester from a large data set. The input is a simple text
@@ -33,7 +38,7 @@ public class LargeInputFST {
     int buckets = 20;
     int shareMaxTail = 10;
 
-    ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
+    ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter());
     FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);
 
     BufferedReader reader = new BufferedReader(

From 803226ece41e9b10eafcc11664f943b9ae3db8b7 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 24 Feb 2014 17:11:07 +0000
Subject: [PATCH 06/17] LUCENE-5468: sort dictionary data with offline sorter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571356 13f79535-47bb-0310-9956-ffa450edef68
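---
Review notes (placed below the "---" separator, so they are not part of the
commit message and are ignored when the patch is applied):
* Temp-file cleanup only happens on the success path: unsorted.delete() runs
  after sorter.sort() returns, and reader.close() / sorted.delete() run at the
  very end of the method. No try/finally is visible in these hunks, so an
  exception while sorting or parsing appears to leave the files (and the open
  reader) behind -- confirm against the full method.
* File.createTempFile("unsorted", "dat", tempDir) uses "dat" verbatim as the
  suffix, so the generated names end in "dat" rather than ".dat" (same for
  "sorted").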
 .../lucene/analysis/hunspell2/Dictionary.java | 72 ++++++++++++-------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index c8068aa41b9..10baa403413 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -20,7 +20,11 @@ package org.apache.lucene.analysis.hunspell2;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
@@ -75,6 +79,8 @@ public class Dictionary {
 
   private String[] aliases;
   private int aliasCount = 0;
+  
+  private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
 
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
@@ -364,38 +370,53 @@ public class Dictionary {
     BytesRef flagsScratch = new BytesRef();
     IntsRef scratchInts = new IntsRef();
     
-    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
-    // TODO: don't create millions of strings.
-    String line = reader.readLine(); // first line is number of entries (approximately, sometimes)
-    // sometimes the number of entries has a comment/copyright after it
-    line = line.replaceFirst("\\s*\\#.*$", "");
-    int numEntries = Integer.parseInt(line);
+    BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+    String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
     
-    String lines[] = new String[numEntries];
-    int upto = 0;
-    while ((line = reader.readLine()) != null) {
-      if (upto == lines.length) {
-        lines = Arrays.copyOf(lines, (int)(lines.length * 1.25));
+    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
+    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
+      while ((line = lines.readLine()) != null) {
+        writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
       }
-      lines[upto++] = line;
     }
+    File sorted = File.createTempFile("sorted", "dat", tempDir);
     
-    // TODO: just replace this with offline sorter?
-    Arrays.sort(lines, 0, upto, new Comparator<String>() {
+    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
+      BytesRef scratch1 = new BytesRef();
+      BytesRef scratch2 = new BytesRef();
+      
       @Override
-      public int compare(String o1, String o2) {
-        int sep1 = o1.lastIndexOf('/');
-        if (sep1 >= 0) {
-          o1 = o1.substring(0, sep1);
+      public int compare(BytesRef o1, BytesRef o2) {
+        scratch1.bytes = o1.bytes;
+        scratch1.offset = o1.offset;
+        scratch1.length = o1.length;
+        
+        for (int i = scratch1.length - 1; i >= 0; i--) {
+          if (scratch1.bytes[scratch1.offset + i] == '/') {
+            scratch1.length = i;
+            break;
+          }
         }
         
-        int sep2 = o2.lastIndexOf('/');
-        if (sep2 >= 0) {
-          o2 = o2.substring(0, sep2);
+        scratch2.bytes = o2.bytes;
+        scratch2.offset = o2.offset;
+        scratch2.length = o2.length;
+        
+        for (int i = scratch2.length - 1; i >= 0; i--) {
+          if (scratch2.bytes[scratch2.offset + i] == '/') {
+            scratch2.length = i;
+            break;
+          }
         }
-        return o1.compareTo(o2);
+        
+        return scratch1.compareTo(scratch2);
       }
     });
+    sorter.sort(unsorted, sorted);
+    unsorted.delete();
+    
+    ByteSequencesReader reader = new ByteSequencesReader(sorted);
+    BytesRef scratchLine = new BytesRef();
     
     // TODO: the flags themselves can be double-chars (long) or also numeric
     // either way the trick is to encode them as char... but they must be parsed differently
@@ -403,8 +424,8 @@ public class Dictionary {
     BytesRef currentEntry = new BytesRef();
     char currentFlags[] = new char[0];
     
-    for (int i = 0; i < upto; i++) {
-      line = lines[i];
+    while (reader.read(scratchLine)) {
+      line = scratchLine.utf8ToString();
       String entry;
       char wordForm[];
       
@@ -457,6 +478,9 @@ public class Dictionary {
     }
     UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts);
     words.add(scratchInts, (long)ord);
+    
+    reader.close();
+    sorted.delete();
   }
   
   static char[] decodeFlags(BytesRef b) {

From 10f548d205e9443872c919f7af0ac1b01c735ed3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 25 Feb 2014 19:18:09 +0000
Subject: [PATCH 07/17] LUCENE-5468: deduplicate patterns used by affix
 condition check

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
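---
Review notes (placed below the "---" separator, so they are not part of the
commit message and are ignored when the patch is applied):
* Besides the pattern deduplication named in the subject, the
  TestAllDictionaries.java hunk comments out @Ignore("enable manually") and
  points DICTIONARY_HOME at a different hard-coded local path. These look like
  local benchmarking changes -- confirm they are meant to be committed, since
  the test depends on files that exist only on the author's machine.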
 .../lucene/analysis/hunspell2/Affix.java      | 16 ++---------
 .../lucene/analysis/hunspell2/Dictionary.java | 27 ++++++++++++++++---
 .../hunspell2/TestAllDictionaries.java        | 10 ++++---
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
index 41c3553fb77..443c006c97d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@@ -28,7 +28,6 @@ final class Affix {
   private char appendFlags[]; // continuation class flags
   private String strip;
   
-  private String condition;
   private Pattern conditionPattern;
   
   private char flag;
@@ -99,24 +98,13 @@ final class Affix {
     this.strip = strip;
   }
 
-  /**
-   * Returns the condition that must be met before the affix can be applied
-   *
-   * @return Condition that must be met before the affix can be applied
-   */
-  public String getCondition() {
-    return condition;
-  }
-
   /**
    * Sets the condition that must be met before the affix can be applied
    *
-   * @param condition Condition to be met before affix application
    * @param pattern Condition as a regular expression pattern
    */
-  public void setCondition(String condition, String pattern) {
-    this.condition = condition;
-    this.conditionPattern = Pattern.compile(pattern);
+  public void setCondition(Pattern pattern) {
+    this.conditionPattern = pattern;
   }
 
   /**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 10baa403413..0456d9946d3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -44,6 +44,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 /**
  * In-memory structure for the dictionary (.dic) and affix (.aff)
@@ -68,6 +69,12 @@ public class Dictionary {
   public CharArrayMap<List<Affix>> prefixes;
   public CharArrayMap<List<Affix>> suffixes;
   
+  // all Patterns used by prefixes and suffixes. these are typically re-used across
+  // many affix stripping rules. so these are deduplicated, to save RAM.
+  // TODO: maybe don't use Pattern for the condition check...
+  // TODO: when we cut over Affix to FST, just store integer index to this.
+  public ArrayList<Pattern> patterns = new ArrayList<>();
+  
   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal for flagLookup
   public FST<Long> words;
@@ -184,6 +191,7 @@ public class Dictionary {
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
     prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
     suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
@@ -191,9 +199,9 @@ public class Dictionary {
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
       } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
@@ -210,12 +218,14 @@ public class Dictionary {
    * @param reader BufferedReader to read the content of the rule from
    * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
    *                         pattern
+   * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
   private void parseAffix(CharArrayMap<List<Affix>> affixes,
                           String header,
                           LineNumberReader reader,
-                          String conditionPattern) throws IOException, ParseException {
+                          String conditionPattern,
+                          Map<String,Integer> seenPatterns) throws IOException, ParseException {
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -261,7 +271,16 @@ public class Dictionary {
       if (condition.indexOf('-') >= 0) {
         condition = condition.replace("-", "\\-");
       }
-      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
+      // deduplicate patterns
+      String regex = String.format(Locale.ROOT, conditionPattern, condition);
+      Integer patternIndex = seenPatterns.get(regex);
+      if (patternIndex == null) {
+        patternIndex = patterns.size();
+        seenPatterns.put(regex, patternIndex);
+        Pattern pattern = Pattern.compile(regex);
+        patterns.add(pattern);
+      }
+      affix.setCondition(patterns.get(patternIndex));
       affix.setCrossProduct(crossProduct);
       
       List<Affix> list = affixes.get(affix.getAppend());
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
index ecb21b97a7c..9f9bce98236 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -33,12 +33,12 @@ import org.junit.Ignore;
  * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
  * Note some of the files differ only in case. This may be a problem on your operating system!
  */
-@Ignore("enable manually")
+//@Ignore("enable manually")
 public class TestAllDictionaries extends LuceneTestCase {
   
   // set this to the location of where you downloaded all the files
   static final File DICTIONARY_HOME = 
-      new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
+      new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
   
   final String tests[] = {
     /* zip file */               /* dictionary */       /* affix */
@@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
         try (InputStream dictionary = zip.getInputStream(dicEntry);
              InputStream affix = zip.getInputStream(affEntry)) {
           Dictionary dic = new Dictionary(affix, dictionary);
-          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
+          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
+                             "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
+                             "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
+                             "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
         }
       }
     }

From 48f55644505c5bf553b2225f9559c351621194b0 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 25 Feb 2014 19:44:32 +0000
Subject: [PATCH 08/17] LUCENE-5468: remove redundant 'append' in Affix

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571802 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 19 -------------------
 .../lucene/analysis/hunspell2/Dictionary.java |  8 +++-----
 .../lucene/analysis/hunspell2/Stemmer.java    |  5 +++--
 3 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
index 443c006c97d..47a81480d6c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@@ -24,7 +24,6 @@ import java.util.regex.Pattern;
  */
 final class Affix {
 
-  private String append; // the affix itself, what is appended
   private char appendFlags[]; // continuation class flags
   private String strip;
   
@@ -44,24 +43,6 @@ final class Affix {
     return conditionPattern.matcher(text).matches();
   }
 
-  /**
-   * Returns the append defined for the affix
-   *
-   * @return Defined append
-   */
-  public String getAppend() {
-    return append;
-  }
-
-  /**
-   * Sets the append defined for the affix
-   *
-   * @param append Defined append for the affix
-   */
-  public void setAppend(String append) {
-    this.append = append;
-  }
-
   /**
    * Returns the flags defined for the affix append
    *
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 0456d9946d3..713bc92210c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -257,9 +257,7 @@ public class Dictionary {
         char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
         Arrays.sort(appendFlags);
         affix.setAppendFlags(appendFlags);
-        affix.setAppend(affixArg.substring(0, flagSep));
-      } else {
-        affix.setAppend(affixArg);
+        affixArg = affixArg.substring(0, flagSep);
       }
 
       String condition = ruleArgs[4];
@@ -283,10 +281,10 @@ public class Dictionary {
       affix.setCondition(patterns.get(patternIndex));
       affix.setCrossProduct(crossProduct);
       
-      List<Affix> list = affixes.get(affix.getAppend());
+      List<Affix> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<Affix>();
-        affixes.put(affix.getAppend(), list);
+        affixes.put(affixArg, list);
       }
       
       list.add(affix);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index aa00836d6fe..62096ef96cf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -126,7 +126,8 @@ final class Stemmer {
 
       for (Affix suffix : suffixes) {
         if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
-          int deAffixedLength = length - suffix.getAppend().length();
+          int appendLength = length - i;
+          int deAffixedLength = length - appendLength;
           // TODO: can we do this in-place?
           String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
 
@@ -148,7 +149,7 @@ final class Stemmer {
 
       for (Affix prefix : prefixes) {
         if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
-          int deAffixedStart = prefix.getAppend().length();
+          int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;
 
           String strippedWord = new StringBuilder().append(prefix.getStrip())

From caaa01d2207b47e0f917760f09b9288710f1615c Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 25 Feb 2014 20:07:05 +0000
Subject: [PATCH 09/17] LUCENE-5468: Stem -> CharsRef

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
---
 .../hunspell2/Hunspell2StemFilter.java        | 13 ++-
 .../lucene/analysis/hunspell2/Stem.java       | 98 -------------------
 .../lucene/analysis/hunspell2/Stemmer.java    | 43 ++++----
 .../analysis/hunspell2/TestStemmer.java       |  5 +-
 4 files changed, 28 insertions(+), 131 deletions(-)
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
index 45941345342..00ff88469be 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.CharsRef;
 
 /**
  * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
@@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
   private final Stemmer stemmer;
   
-  private List<Stem> buffer;
+  private List<CharsRef> buffer;
   private State savedState;
   
   private final boolean dedup;
@@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (buffer != null && !buffer.isEmpty()) {
-      Stem nextStem = buffer.remove(0);
+      CharsRef nextStem = buffer.remove(0);
       restoreState(savedState);
       posIncAtt.setPositionIncrement(0);
-      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
-      termAtt.setLength(nextStem.getStemLength());
+      termAtt.setEmpty().append(nextStem);
       return true;
     }
     
@@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
       return true;
     }     
 
-    Stem stem = buffer.remove(0);
-    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
-    termAtt.setLength(stem.getStemLength());
+    CharsRef stem = buffer.remove(0);
+    termAtt.setEmpty().append(stem);
 
     if (!buffer.isEmpty()) {
       savedState = captureState();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java
deleted file mode 100644
index d3c8d4c86ab..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
- * that were used to change the word into the stem.
- */
-final class Stem {
-  final List<Affix> prefixes = new ArrayList<Affix>();
-  final List<Affix> suffixes = new ArrayList<Affix>();
-  final char stem[];
-  final int stemLength;
-
-  /**
-   * Creates a new Stem wrapping the given word stem
-   *
-   * @param stem Stem of a word
-   */
-  public Stem(char stem[], int stemLength) {
-    this.stem = stem;
-    this.stemLength = stemLength;
-  }
-
-  /**
-   * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
-   * depth first, the prefix is added to the front of the list
-   *
-   * @param prefix Prefix to add to the list of prefixes for this stem
-   */
-  public void addPrefix(Affix prefix) {
-    prefixes.add(0, prefix);
-  }
-
-  /**
-   * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
-   * depth first, the suffix is added to the end of the list
-   *
-   * @param suffix Suffix to add to the list of suffixes for this stem
-   */
-  public void addSuffix(Affix suffix) {
-    suffixes.add(suffix);
-  }
-
-  /**
-   * Returns the list of prefixes used to generate the stem
-   *
-   * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
-   */
-  public List<Affix> getPrefixes() {
-    return prefixes;
-  }
-
-  /**
-   * Returns the list of suffixes used to generate the stem
-   * 
-   * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
-   */
-  public List<Affix> getSuffixes() {
-    return suffixes;
-  }
-
-  /**
-   * Returns the text of the word's stem.
-   * @see #getStemLength()
-   */
-  public char[] getStem() {
-    return stem;
-  }
-
-  /** Returns the valid length of the text in {@link #getStem()} */
-  public int getStemLength() {
-    return stemLength;
-  }
-  
-  /** Only use this if you really need a string (e.g. for testing) */
-  public String getStemString() {
-    return new String(stem, 0, stemLength);
-  }
-}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 62096ef96cf..7919ad56be7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -24,6 +24,7 @@ import java.util.List;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
 
 /**
@@ -63,7 +64,7 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> stem(String word) {
+  public List<CharsRef> stem(String word) {
     return stem(word.toCharArray(), word.length());
   }
 
@@ -73,10 +74,10 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> stem(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
+  public List<CharsRef> stem(char word[], int length) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
-      stems.add(new Stem(word, length));
+      stems.add(new CharsRef(word, 0, length));
     }
     stems.addAll(stem(word, length, null, 0));
     return stems;
@@ -88,18 +89,18 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> uniqueStems(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
+  public List<CharsRef> uniqueStems(char word[], int length) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
-      stems.add(new Stem(word, length));
+      stems.add(new CharsRef(word, 0, length));
       terms.add(word);
     }
-    List<Stem> otherStems = stem(word, length, null, 0);
-    for (Stem s : otherStems) {
-      if (!terms.contains(s.stem)) {
+    List<CharsRef> otherStems = stem(word, length, null, 0);
+    for (CharsRef s : otherStems) {
+      if (!terms.contains(s)) {
         stems.add(s);
-        terms.add(s.stem);
+        terms.add(s);
       }
     }
     return stems;
@@ -115,8 +116,8 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems, or empty list if no stems are found
    */
-  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
-    List<Stem> stems = new ArrayList<Stem>();
+  private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
       List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -131,10 +132,7 @@ final class Stemmer {
           // TODO: can we do this in-place?
           String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
 
-          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
-          for (Stem stem : stemList) {
-            stem.addSuffix(suffix);
-          }
+          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
 
           stems.addAll(stemList);
         }
@@ -156,10 +154,7 @@ final class Stemmer {
               .append(word, deAffixedStart, deAffixedLength)
               .toString();
 
-          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
-          for (Stem stem : stemList) {
-            stem.addPrefix(prefix);
-          }
+          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
 
           stems.addAll(stemList);
         }
@@ -177,18 +172,18 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     if (!affix.checkCondition(segment)) {
       return Collections.emptyList();
     }
 
-    List<Stem> stems = new ArrayList<Stem>();
+    List<CharsRef> stems = new ArrayList<CharsRef>();
 
     char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
     if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
-      stems.add(new Stem(strippedWord, length));
+      stems.add(new CharsRef(strippedWord, 0, length));
     }
 
     if (affix.isCrossProduct() && recursionDepth < recursionCap) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
index a8ac2a83fa9..4dec107f314 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
   private void assertStemsTo(String s, String... expected) {
     Arrays.sort(expected);
     
-    List<Stem> stems = stemmer.stem(s);
+    List<CharsRef> stems = stemmer.stem(s);
     String actual[] = new String[stems.size()];
     for (int i = 0; i < actual.length; i++) {
-      actual[i] = stems.get(i).getStemString();
+      actual[i] = stems.get(i).toString();
     }
     Arrays.sort(actual);
     

From d7cc408585363c1b2ea5d7c725515829cc4f5ff7 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 25 Feb 2014 22:29:27 +0000
Subject: [PATCH 10/17] LUCENE-5468: make Affix fixed-width

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571844 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 41 ++++++----------
 .../lucene/analysis/hunspell2/Dictionary.java | 47 +++++++++++++++----
 .../lucene/analysis/hunspell2/Stemmer.java    | 24 +++++++---
 3 files changed, 68 insertions(+), 44 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
index 47a81480d6c..eb67f60e763 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@@ -17,38 +17,23 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
-import java.util.regex.Pattern;
-
 /**
  * Wrapper class representing a hunspell affix
  */
 final class Affix {
 
-  private char appendFlags[]; // continuation class flags
-  private String strip;
-  
-  private Pattern conditionPattern;
-  
-  private char flag;
-
+  private int appendFlags; // continuation class flags
+  private int condition; // check condition
   private boolean crossProduct;
-
-  /**
-   * Checks whether the given text matches the conditional pattern on this affix
-   *
-   * @param text Text to check if it matches the affix's conditional pattern
-   * @return {@code true} if the text meets the condition, {@code false} otherwise
-   */
-  public boolean checkCondition(CharSequence text) {
-    return conditionPattern.matcher(text).matches();
-  }
+  private char flag;
+  private int strip;
 
   /**
    * Returns the flags defined for the affix append
    *
    * @return Flags defined for the affix append
    */
-  public char[] getAppendFlags() {
+  public int getAppendFlags() {
     return appendFlags;
   }
 
@@ -57,7 +42,7 @@ final class Affix {
    *
    * @param appendFlags Flags defined for the affix append
    */
-  public void setAppendFlags(char[] appendFlags) {
+  public void setAppendFlags(int appendFlags) {
     this.appendFlags = appendFlags;
   }
 
@@ -66,7 +51,7 @@ final class Affix {
    *
    * @return Stripping characters defined for the affix
    */
-  public String getStrip() {
+  public int getStrip() {
     return strip;
   }
 
@@ -75,17 +60,19 @@ final class Affix {
    *
    * @param strip Stripping characters defined for the affix
    */
-  public void setStrip(String strip) {
+  public void setStrip(int strip) {
     this.strip = strip;
   }
 
   /**
    * Sets the condition that must be met before the affix can be applied
-   *
-   * @param pattern Condition as a regular expression pattern
    */
-  public void setCondition(Pattern pattern) {
-    this.conditionPattern = pattern;
+  public void setCondition(int condition) {
+    this.condition = condition;
+  }
+  
+  public int getCondition() {
+    return condition;
   }
 
   /**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 713bc92210c..35c7aee6081 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -81,6 +81,9 @@ public class Dictionary {
   // the list of unique flagsets (wordforms). theoretically huge, but practically
   // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
   public BytesRefHash flagLookup = new BytesRefHash();
+  
+  // the list of unique strip affixes.
+  public BytesRefHash stripLookup = new BytesRefHash();
 
   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
 
@@ -107,6 +110,7 @@ public class Dictionary {
     CharsetDecoder decoder = getJavaEncoding(encoding);
     readAffixFile(buffered, decoder);
     flagLookup.add(new BytesRef()); // no flags -> ord 0
+    stripLookup.add(new BytesRef()); // no strip -> ord 0
     PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
     Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o);
     readDictionaryFile(dictionary, decoder, b);
@@ -226,6 +230,8 @@ public class Dictionary {
                           LineNumberReader reader,
                           String conditionPattern,
                           Map<String,Integer> seenPatterns) throws IOException, ParseException {
+    
+    BytesRef scratch = new BytesRef();
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -239,25 +245,23 @@ public class Dictionary {
           throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
       }
 
-      Affix affix = new Affix();
       
-      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
-      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
-
+      char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
+      String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
       String affixArg = ruleArgs[3];
+      char appendFlags[] = null;
       
       int flagSep = affixArg.lastIndexOf('/');
       if (flagSep != -1) {
         String flagPart = affixArg.substring(flagSep + 1);
-        
+        affixArg = affixArg.substring(0, flagSep);
+
         if (aliasCount > 0) {
           flagPart = getAliasValue(Integer.parseInt(flagPart));
         } 
         
-        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
+        appendFlags = flagParsingStrategy.parseFlags(flagPart);
         Arrays.sort(appendFlags);
-        affix.setAppendFlags(appendFlags);
-        affixArg = affixArg.substring(0, flagSep);
       }
 
       String condition = ruleArgs[4];
@@ -269,8 +273,10 @@ public class Dictionary {
       if (condition.indexOf('-') >= 0) {
         condition = condition.replace("-", "\\-");
       }
-      // deduplicate patterns
+
       String regex = String.format(Locale.ROOT, conditionPattern, condition);
+      
+      // deduplicate patterns
       Integer patternIndex = seenPatterns.get(regex);
       if (patternIndex == null) {
         patternIndex = patterns.size();
@@ -278,8 +284,29 @@ public class Dictionary {
         Pattern pattern = Pattern.compile(regex);
         patterns.add(pattern);
       }
-      affix.setCondition(patterns.get(patternIndex));
+      
+      Affix affix = new Affix();
+      scratch.copyChars(strip);
+      int ord = stripLookup.add(scratch);
+      if (ord < 0) {
+        // already exists in our hash
+        ord = (-ord)-1;
+      }
+      affix.setStrip(ord);
+      affix.setFlag(flag);
+      affix.setCondition(patternIndex);
       affix.setCrossProduct(crossProduct);
+      if (appendFlags == null) {
+        appendFlags = NOFLAGS;
+      }
+      
+      final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
+      ord = flagLookup.add(scratch, hashCode);
+      if (ord < 0) {
+        // already exists in our hash
+        ord = (-ord)-1;
+      }
+      affix.setAppendFlags(ord);
       
       List<Affix> list = affixes.get(affixArg);
       if (list == null) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 7919ad56be7..b2057c501b2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.BytesRef;
@@ -79,7 +80,7 @@ final class Stemmer {
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
       stems.add(new CharsRef(word, 0, length));
     }
-    stems.addAll(stem(word, length, null, 0));
+    stems.addAll(stem(word, length, Dictionary.NOFLAGS, 0));
     return stems;
   }
   
@@ -96,7 +97,7 @@ final class Stemmer {
       stems.add(new CharsRef(word, 0, length));
       terms.add(word);
     }
-    List<CharsRef> otherStems = stem(word, length, null, 0);
+    List<CharsRef> otherStems = stem(word, length, Dictionary.NOFLAGS, 0);
     for (CharsRef s : otherStems) {
       if (!terms.contains(s)) {
         stems.add(s);
@@ -117,7 +118,9 @@ final class Stemmer {
    * @return List of stems, or empty list if no stems are found
    */
   private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
+    // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
+    BytesRef scratch = new BytesRef();
 
     for (int i = 0; i < length; i++) {
       List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -130,7 +133,8 @@ final class Stemmer {
           int appendLength = length - i;
           int deAffixedLength = length - appendLength;
           // TODO: can we do this in-place?
-          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
+          dictionary.stripLookup.get(suffix.getStrip(), scratch);
+          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
 
           List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
 
@@ -150,7 +154,8 @@ final class Stemmer {
           int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;
 
-          String strippedWord = new StringBuilder().append(prefix.getStrip())
+          dictionary.stripLookup.get(prefix.getStrip(), scratch);
+          String strippedWord = new StringBuilder().append(scratch.utf8ToString())
               .append(word, deAffixedStart, deAffixedLength)
               .toString();
 
@@ -175,7 +180,9 @@ final class Stemmer {
   public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
-    if (!affix.checkCondition(segment)) {
+    
+    Pattern pattern = dictionary.patterns.get(affix.getCondition());
+    if (!pattern.matcher(segment).matches()) {
       return Collections.emptyList();
     }
 
@@ -187,7 +194,10 @@ final class Stemmer {
     }
 
     if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
+      BytesRef scratch = new BytesRef();
+      dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
+      char appendFlags[] = Dictionary.decodeFlags(scratch);
+      stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
     }
 
     return stems;
@@ -201,6 +211,6 @@ final class Stemmer {
    * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
    */
   private boolean hasCrossCheckedFlag(char flag, char[] flags) {
-    return flags == null || Arrays.binarySearch(flags, flag) >= 0;
+    return flags.length == 0 || Arrays.binarySearch(flags, flag) >= 0;
   }
 }

From 9896e610d36a68a9f331132b889bd326ae7d4163 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 16:19:21 +0000
Subject: [PATCH 11/17] LUCENE-5468: don't create unnecessary objects

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572643 13f79535-47bb-0310-9956-ffa450edef68
---
 .../src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index b2057c501b2..54dce381b1a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -120,7 +120,6 @@ final class Stemmer {
   private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
-    BytesRef scratch = new BytesRef();
 
     for (int i = 0; i < length; i++) {
       List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -194,7 +193,6 @@ final class Stemmer {
     }
 
     if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      BytesRef scratch = new BytesRef();
       dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
       char appendFlags[] = Dictionary.decodeFlags(scratch);
       stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));

From cdec14902bc86e7826c3194199dceaa40991c153 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 17:19:15 +0000
Subject: [PATCH 12/17] LUCENE-5468: encode affix data as 8 bytes per affix,
 before cutting over to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 113 ------------------
 .../lucene/analysis/hunspell2/Dictionary.java |  64 ++++++----
 .../lucene/analysis/hunspell2/Stemmer.java    |  43 +++++--
 3 files changed, 71 insertions(+), 149 deletions(-)
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
deleted file mode 100644
index eb67f60e763..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ /dev/null
@@ -1,113 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Wrapper class representing a hunspell affix
- */
-final class Affix {
-
-  private int appendFlags; // continuation class flags
-  private int condition; // check condition
-  private boolean crossProduct;
-  private char flag;
-  private int strip;
-
-  /**
-   * Returns the flags defined for the affix append
-   *
-   * @return Flags defined for the affix append
-   */
-  public int getAppendFlags() {
-    return appendFlags;
-  }
-
-  /**
-   * Sets the flags defined for the affix append
-   *
-   * @param appendFlags Flags defined for the affix append
-   */
-  public void setAppendFlags(int appendFlags) {
-    this.appendFlags = appendFlags;
-  }
-
-  /**
-   * Returns the stripping characters defined for the affix
-   *
-   * @return Stripping characters defined for the affix
-   */
-  public int getStrip() {
-    return strip;
-  }
-
-  /**
-   * Sets the stripping characters defined for the affix
-   *
-   * @param strip Stripping characters defined for the affix
-   */
-  public void setStrip(int strip) {
-    this.strip = strip;
-  }
-
-  /**
-   * Sets the condition that must be met before the affix can be applied
-   */
-  public void setCondition(int condition) {
-    this.condition = condition;
-  }
-  
-  public int getCondition() {
-    return condition;
-  }
-
-  /**
-   * Returns the affix flag
-   *
-   * @return Affix flag
-   */
-  public char getFlag() {
-    return flag;
-  }
-
-  /**
-   * Sets the affix flag
-   *
-   * @param flag Affix flag
-   */
-  public void setFlag(char flag) {
-    this.flag = flag;
-  }
-
-  /**
-   * Returns whether the affix is defined as cross product
-   *
-   * @return {@code true} if the affix is cross product, {@code false} otherwise
-   */
-  public boolean isCrossProduct() {
-    return crossProduct;
-  }
-
-  /**
-   * Sets whether the affix is defined as cross product
-   *
-   * @param crossProduct Whether the affix is defined as cross product
-   */
-  public void setCrossProduct(boolean crossProduct) {
-    this.crossProduct = crossProduct;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 35c7aee6081..b30bdaa1e92 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2;
  */
 
 import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.IOUtils;
@@ -66,8 +68,8 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public CharArrayMap<List<Affix>> prefixes;
-  public CharArrayMap<List<Affix>> suffixes;
+  public CharArrayMap<List<Character>> prefixes;
+  public CharArrayMap<List<Character>> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -84,6 +86,10 @@ public class Dictionary {
   
   // the list of unique strip affixes.
   public BytesRefHash stripLookup = new BytesRefHash();
+  
+  // 8 bytes per affix
+  public byte[] affixData = new byte[64];
+  private int currentAffix = 0;
 
   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
 
@@ -169,7 +175,7 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
    */
-  public List<Affix> lookupPrefix(char word[], int offset, int length) {
+  public List<Character> lookupPrefix(char word[], int offset, int length) {
     return prefixes.get(word, offset, length);
   }
 
@@ -181,7 +187,7 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
    */
-  List<Affix> lookupSuffix(char word[], int offset, int length) {
+  List<Character> lookupSuffix(char word[], int offset, int length) {
     return suffixes.get(word, offset, length);
   }
 
@@ -193,8 +199,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@@ -225,7 +231,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(CharArrayMap<List<Affix>> affixes,
+  private void parseAffix(CharArrayMap<List<Character>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
@@ -237,14 +243,20 @@ public class Dictionary {
     boolean crossProduct = args[2].equals("Y");
     
     int numLines = Integer.parseInt(args[3]);
+    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
+    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
+    
     for (int i = 0; i < numLines; i++) {
+      if (currentAffix > Short.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
+      }
+      assert affixWriter.getPosition() == currentAffix << 3;
       String line = reader.readLine();
       String ruleArgs[] = line.split("\\s+");
 
       if (ruleArgs.length < 5) {
           throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
       }
-
       
       char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
       String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
@@ -285,36 +297,42 @@ public class Dictionary {
         patterns.add(pattern);
       }
       
-      Affix affix = new Affix();
       scratch.copyChars(strip);
-      int ord = stripLookup.add(scratch);
-      if (ord < 0) {
+      int stripOrd = stripLookup.add(scratch);
+      if (stripOrd < 0) {
         // already exists in our hash
-        ord = (-ord)-1;
+        stripOrd = (-stripOrd)-1;
       }
-      affix.setStrip(ord);
-      affix.setFlag(flag);
-      affix.setCondition(patternIndex);
-      affix.setCrossProduct(crossProduct);
+
       if (appendFlags == null) {
         appendFlags = NOFLAGS;
       }
       
       final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
-      ord = flagLookup.add(scratch, hashCode);
-      if (ord < 0) {
+      int appendFlagsOrd = flagLookup.add(scratch, hashCode);
+      if (appendFlagsOrd < 0) {
         // already exists in our hash
-        ord = (-ord)-1;
+        appendFlagsOrd = (-appendFlagsOrd)-1;
+      } else if (appendFlagsOrd > Short.MAX_VALUE) {
+        // this limit is probably flexible, but its a good sanity check too
+        throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
       }
-      affix.setAppendFlags(ord);
       
-      List<Affix> list = affixes.get(affixArg);
+      affixWriter.writeShort((short)flag);
+      affixWriter.writeShort((short)stripOrd);
+      // encode crossProduct into patternIndex
+      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
+      affixWriter.writeShort((short)patternOrd);
+      affixWriter.writeShort((short)appendFlagsOrd);
+      
+      List<Character> list = affixes.get(affixArg);
       if (list == null) {
-        list = new ArrayList<Affix>();
+        list = new ArrayList<Character>();
         affixes.put(affixArg, list);
       }
       
-      list.add(affix);
+      list.add((char)currentAffix);
+      currentAffix++;
     }
   }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 54dce381b1a..4eaff6a9e95 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -24,6 +24,7 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
@@ -37,6 +38,7 @@ final class Stemmer {
   private final Dictionary dictionary;
   private BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
+  private final ByteArrayDataInput affixReader;
 
   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
@@ -56,6 +58,7 @@ final class Stemmer {
    */
   public Stemmer(Dictionary dictionary, int recursionCap) {
     this.dictionary = dictionary;
+    this.affixReader = new ByteArrayDataInput(dictionary.affixData);
     this.recursionCap = recursionCap;
   } 
   
@@ -122,17 +125,20 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
-      List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
       if (suffixes == null) {
         continue;
       }
 
-      for (Affix suffix : suffixes) {
-        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
+      for (Character suffix : suffixes) {
+        affixReader.setPosition(8 * suffix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
           int appendLength = length - i;
           int deAffixedLength = length - appendLength;
           // TODO: can we do this in-place?
-          dictionary.stripLookup.get(suffix.getStrip(), scratch);
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);
+          dictionary.stripLookup.get(stripOrd, scratch);
           String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
 
           List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
@@ -143,17 +149,20 @@ final class Stemmer {
     }
 
     for (int i = length - 1; i >= 0; i--) {
-      List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
+      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
       if (prefixes == null) {
         continue;
       }
 
-      for (Affix prefix : prefixes) {
-        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
+      for (Character prefix : prefixes) {
+        affixReader.setPosition(8 * prefix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
           int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);
 
-          dictionary.stripLookup.get(prefix.getStrip(), scratch);
+          dictionary.stripLookup.get(stripOrd, scratch);
           String strippedWord = new StringBuilder().append(scratch.utf8ToString())
               .append(word, deAffixedStart, deAffixedLength)
               .toString();
@@ -176,11 +185,19 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
-    Pattern pattern = dictionary.patterns.get(affix.getCondition());
+    affixReader.setPosition(8 * affix);
+    char flag = (char) (affixReader.readShort() & 0xffff);
+    affixReader.skipBytes(2); // strip
+    int condition = (char) (affixReader.readShort() & 0xffff);
+    boolean crossProduct = (condition & 1) == 1;
+    condition >>>= 1;
+    char append = (char) (affixReader.readShort() & 0xffff);
+
+    Pattern pattern = dictionary.patterns.get(condition);
     if (!pattern.matcher(segment).matches()) {
       return Collections.emptyList();
     }
@@ -188,12 +205,12 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
-    if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
+    if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
       stems.add(new CharsRef(strippedWord, 0, length));
     }
 
-    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
+    if (crossProduct && recursionDepth < recursionCap) {
+      dictionary.flagLookup.get(append, scratch);
       char appendFlags[] = Dictionary.decodeFlags(scratch);
       stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
     }

From b2b86dd8add14c8e16d9b794707a3948834a6e68 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 17:53:30 +0000
Subject: [PATCH 13/17] LUCENE-5468: convert affixes to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Dictionary.java | 78 ++++++++++++++++---
 .../lucene/analysis/hunspell2/Stemmer.java    | 13 ++--
 .../hunspell2/TestAllDictionaries.java        |  3 +
 .../analysis/hunspell2/TestDictionary.java    |  8 +-
 4 files changed, 83 insertions(+), 19 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index b30bdaa1e92..b9f9c82c2f5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
 
 import java.io.*;
 import java.nio.charset.Charset;
@@ -46,6 +48,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.regex.Pattern;
 
 /**
@@ -68,8 +71,8 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public CharArrayMap<List<Character>> prefixes;
-  public CharArrayMap<List<Character>> suffixes;
+  public FST<IntsRef> prefixes;
+  public FST<IntsRef> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -137,7 +140,7 @@ public class Dictionary {
       ord = lookupOrd(word, offset, length);
     } catch (IOException ex) { /* bogus */ }
     if (ord == null) {
-      return null;
+      return null;  
     }
     return decodeFlags(flagLookup.get(ord, scratch));
   }
@@ -175,8 +178,8 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
    */
-  public List<Character> lookupPrefix(char word[], int offset, int length) {
-    return prefixes.get(word, offset, length);
+  IntsRef lookupPrefix(char word[], int offset, int length) {
+    return lookupAffix(prefixes, word, offset, length);
   }
 
   /**
@@ -187,8 +190,42 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
    */
-  List<Character> lookupSuffix(char word[], int offset, int length) {
-    return suffixes.get(word, offset, length);
+  IntsRef lookupSuffix(char word[], int offset, int length) {
+    return lookupAffix(suffixes, word, offset, length);
+  }
+  
+  // TODO: this restarts from the FST root on every call, which is wasteful considering how the
+  // stemming algorithm works (it probes each prefix/suffix in turn); we can make it significantly faster!
+  IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
+    if (fst == null) {
+      return null;
+    }
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
+    // Accumulate output as we go
+    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    IntsRef output = NO_OUTPUT;
+    
+    int l = offset + length;
+    try {
+      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+        cp = Character.codePointAt(word, i, l);
+        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
+          return null;
+        } else if (arc.output != NO_OUTPUT) {
+          output = fst.outputs.add(output, arc.output);
+        }
+      }
+      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        return fst.outputs.add(output, arc.output);
+      } else {
+        return output;
+      }
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
   }
 
   /**
@@ -199,8 +236,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@@ -218,6 +255,27 @@ public class Dictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       }
     }
+    
+    this.prefixes = affixFST(prefixes);
+    this.suffixes = affixFST(suffixes);
+  }
+  
+  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+    
+    IntsRef scratch = new IntsRef();
+    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+      Util.toUTF32(entry.getKey(), scratch);
+      List<Character> entries = entry.getValue();
+      IntsRef output = new IntsRef(entries.size());
+      int upto = 0;
+      for (Character c : entries) {
+        output.ints[output.length++] = c;
+      }
+      builder.add(scratch, output);
+    }
+    return builder.finish();
   }
 
   /**
@@ -231,7 +289,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(CharArrayMap<List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Character>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 4eaff6a9e95..d6b0133830a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
 
 /**
@@ -125,12 +126,13 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
-      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
       if (suffixes == null) {
         continue;
       }
 
-      for (Character suffix : suffixes) {
+      for (int j = 0; j < suffixes.length; j++) {
+        int suffix = suffixes.ints[suffixes.offset + j];
         affixReader.setPosition(8 * suffix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -149,12 +151,13 @@ final class Stemmer {
     }
 
     for (int i = length - 1; i >= 0; i--) {
-      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
+      IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
       if (prefixes == null) {
         continue;
       }
 
-      for (Character prefix : prefixes) {
+      for (int j = 0; j < prefixes.length; j++) {
+        int prefix = prefixes.ints[prefixes.offset + j];
         affixReader.setPosition(8 * prefix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -185,7 +188,7 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
index 9f9bce98236..d00fc634944 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase {
           System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                             "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
+                             "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                              "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
                              "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
         }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
index 14c6e8967d0..e8e0fd0d030 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
@@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase {
     InputStream dictStream = getClass().getResourceAsStream("simple.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
     assertNotNull(flags);
     assertEquals(1, flags.length);
@@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase {
     InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
     
     affixStream.close();

From c4f4beb27e6cb636b0b151b4288f2230e350adc4 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 20:19:27 +0000
Subject: [PATCH 14/17] LUCENE-5468: hunspell2 -> hunspell (with previous
 options and tests)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68
---
 .../{hunspell2 => hunspell}/Dictionary.java   |  85 ++-
 .../analysis/hunspell/HunspellAffix.java      | 157 ------
 .../analysis/hunspell/HunspellDictionary.java | 507 ------------------
 .../analysis/hunspell/HunspellStemFilter.java |  89 ++-
 .../hunspell/HunspellStemFilterFactory.java   |  62 +--
 .../analysis/hunspell/HunspellStemmer.java    | 392 --------------
 .../analysis/hunspell/HunspellWord.java       |  63 ---
 .../ISO8859_14Decoder.java                    |   2 +-
 .../{hunspell2 => hunspell}/Stemmer.java      |  28 +-
 .../hunspell2/Hunspell2StemFilter.java        | 137 -----
 .../hunspell2/Hunspell2StemFilterFactory.java |  80 ---
 .../lucene/analysis/hunspell2/package.html    |  26 -
 ...he.lucene.analysis.util.TokenFilterFactory |   1 -
 .../analysis/core/TestRandomChains.java       |  12 +-
 .../hunspell/HunspellDictionaryTest.java      | 201 -------
 .../hunspell/HunspellStemFilterTest.java      |  92 ----
 .../hunspell/HunspellStemmerTest.java         | 137 -----
 .../TestAllDictionaries.java                  |  20 +-
 .../hunspell/TestCaseInsensitive.java         | 110 ++++
 .../TestDictionary.java                       |   3 +-
 .../TestHunspellStemFilter.java}              |  22 +-
 .../TestHunspellStemFilterFactory.java        |  11 +-
 .../{hunspell2 => hunspell}/TestStemmer.java  |   4 +-
 .../{hunspell2 => hunspell}/broken.aff        |   0
 .../{hunspell2 => hunspell}/compressed.aff    |   0
 .../{hunspell2 => hunspell}/compressed.dic    |   0
 .../lucene/analysis/hunspell/mixedcase.dic    |  10 +
 .../{hunspell2 => hunspell}/simple.aff        |   0
 .../{hunspell2 => hunspell}/simple.dic        |   0
 .../apache/lucene/analysis/hunspell/test.aff  |  20 -
 .../apache/lucene/analysis/hunspell/test.dic  |  10 -
 .../analysis/hunspell/testCompressed.aff      |  29 -
 .../analysis/hunspell/testCompressed.dic      |   9 -
 .../lucene/analysis/hunspell/testOverride.dic |   3 -
 .../analysis/hunspell/testWrongAffixRule.aff  |  24 -
 .../TestHunspell2StemFilterFactory.java       |  50 --
 36 files changed, 320 insertions(+), 2076 deletions(-)
 rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Dictionary.java (90%)
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
 rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/ISO8859_14Decoder.java (98%)
 rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Stemmer.java (92%)
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestAllDictionaries.java (93%)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestDictionary.java (97%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2/TestHunspell2StemFilter.java => hunspell/TestHunspellStemFilter.java} (75%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestStemmer.java (95%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/broken.aff (100%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.aff (100%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.dic (100%)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.aff (100%)
 rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.dic (100%)
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
 delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
similarity index 90%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index b9f9c82c2f5..7bbf27fb817 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
@@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
 
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
@@ -71,27 +75,27 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public FST<IntsRef> prefixes;
-  public FST<IntsRef> suffixes;
+  FST<IntsRef> prefixes;
+  FST<IntsRef> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
   // TODO: maybe don't use Pattern for the condition check...
   // TODO: when we cut over Affix to FST, just store integer index to this.
-  public ArrayList<Pattern> patterns = new ArrayList<>();
+  ArrayList<Pattern> patterns = new ArrayList<>();
   
   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal for flagLookup
-  public FST<Long> words;
+  FST<Long> words;
   // the list of unique flagsets (wordforms). theoretically huge, but practically
   // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
-  public BytesRefHash flagLookup = new BytesRefHash();
+  BytesRefHash flagLookup = new BytesRefHash();
   
   // the list of unique strip affixes.
-  public BytesRefHash stripLookup = new BytesRefHash();
+  BytesRefHash stripLookup = new BytesRefHash();
   
   // 8 bytes per affix
-  public byte[] affixData = new byte[64];
+  byte[] affixData = new byte[64];
   private int currentAffix = 0;
 
   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
@@ -100,7 +104,11 @@ public class Dictionary {
   private int aliasCount = 0;
   
   private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
-
+  
+  public static final int IGNORE_CASE = 1;
+  
+  boolean ignoreCase;
+  
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
@@ -112,6 +120,21 @@ public class Dictionary {
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
+    this(affix, Collections.singletonList(dictionary), false);
+  }
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files. If {@code ignoreCase} is true, the word part of each dictionary entry is lowercased as it is read.
+   * You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
+    this.ignoreCase = ignoreCase;
     BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
     buffered.mark(8192);
     String encoding = getDictionaryEncoding(affix);
@@ -122,7 +145,7 @@ public class Dictionary {
     stripLookup.add(new BytesRef()); // no strip -> ord 0
     PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
     Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o);
-    readDictionaryFile(dictionary, decoder, b);
+    readDictionaryFiles(dictionaries, decoder, b);
     words = b.finish();
   }
 
@@ -145,7 +168,7 @@ public class Dictionary {
     return decodeFlags(flagLookup.get(ord, scratch));
   }
   
-  public Integer lookupOrd(char word[], int offset, int length) throws IOException {
+  Integer lookupOrd(char word[], int offset, int length) throws IOException {
     final FST.BytesReader bytesReader = words.getBytesReader();
     final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
     // Accumulate output as we go
@@ -269,7 +292,6 @@ public class Dictionary {
       Util.toUTF32(entry.getKey(), scratch);
       List<Character> entries = entry.getValue();
       IntsRef output = new IntsRef(entries.size());
-      int upto = 0;
       for (Character c : entries) {
         output.ints[output.length++] = c;
       }
@@ -480,23 +502,39 @@ public class Dictionary {
   }
 
   /**
-   * Reads the dictionary file through the provided InputStream, building up the words map
+   * Reads the dictionary files through the provided InputStreams, building up the words map
    *
-   * @param dictionary InputStream to read the dictionary file through
+   * @param dictionaries InputStreams to read the dictionary files through
    * @param decoder CharsetDecoder used to decode the contents of the file
    * @throws IOException Can be thrown while reading from the file
    */
-  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder<Long> words) throws IOException {
+  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<Long> words) throws IOException {
     BytesRef flagsScratch = new BytesRef();
     IntsRef scratchInts = new IntsRef();
     
-    BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
-    String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
-    
     File unsorted = File.createTempFile("unsorted", "dat", tempDir);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
-      while ((line = lines.readLine()) != null) {
-        writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
+      for (InputStream dictionary : dictionaries) {
+        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+        String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
+        
+        while ((line = lines.readLine()) != null) {
+          if (ignoreCase) {
+            int flagSep = line.lastIndexOf('/');
+            if (flagSep == -1) {
+              writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
+            } else {
+              StringBuilder sb = new StringBuilder();
+              sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
+              if (flagSep < line.length()) {
+                sb.append(line.substring(flagSep, line.length()));
+              }
+              writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
+            }
+          } else {
+            writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
+          }
+        }
       }
     }
     File sorted = File.createTempFile("sorted", "dat", tempDir);
@@ -544,6 +582,7 @@ public class Dictionary {
     BytesRef currentEntry = new BytesRef();
     char currentFlags[] = new char[0];
     
+    String line;
     while (reader.read(scratchLine)) {
       line = scratchLine.utf8ToString();
       String entry;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
deleted file mode 100644
index 97376c0b15e..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
+++ /dev/null
@@ -1,157 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.regex.Pattern;
-
-/**
- * Wrapper class representing a hunspell affix
- */
-public class HunspellAffix {
-
-  private String append; // the affix itself, what is appended
-  private char appendFlags[]; // continuation class flags
-  private String strip;
-  
-  private String condition;
-  private Pattern conditionPattern;
-  
-  private char flag;
-
-  private boolean crossProduct;
-
-  /**
-   * Checks whether the given text matches the conditional pattern on this affix
-   *
-   * @param text Text to check if it matches the affix's conditional pattern
-   * @return {@code true} if the text meets the condition, {@code false} otherwise
-   */
-  public boolean checkCondition(CharSequence text) {
-    return conditionPattern.matcher(text).matches();
-  }
-
-  /**
-   * Returns the append defined for the affix
-   *
-   * @return Defined append
-   */
-  public String getAppend() {
-    return append;
-  }
-
-  /**
-   * Sets the append defined for the affix
-   *
-   * @param append Defined append for the affix
-   */
-  public void setAppend(String append) {
-    this.append = append;
-  }
-
-  /**
-   * Returns the flags defined for the affix append
-   *
-   * @return Flags defined for the affix append
-   */
-  public char[] getAppendFlags() {
-    return appendFlags;
-  }
-
-  /**
-   * Sets the flags defined for the affix append
-   *
-   * @param appendFlags Flags defined for the affix append
-   */
-  public void setAppendFlags(char[] appendFlags) {
-    this.appendFlags = appendFlags;
-  }
-
-  /**
-   * Returns the stripping characters defined for the affix
-   *
-   * @return Stripping characters defined for the affix
-   */
-  public String getStrip() {
-    return strip;
-  }
-
-  /**
-   * Sets the stripping characters defined for the affix
-   *
-   * @param strip Stripping characters defined for the affix
-   */
-  public void setStrip(String strip) {
-    this.strip = strip;
-  }
-
-  /**
-   * Returns the condition that must be met before the affix can be applied
-   *
-   * @return Condition that must be met before the affix can be applied
-   */
-  public String getCondition() {
-    return condition;
-  }
-
-  /**
-   * Sets the condition that must be met before the affix can be applied
-   *
-   * @param condition Condition to be met before affix application
-   * @param pattern Condition as a regular expression pattern
-   */
-  public void setCondition(String condition, String pattern) {
-    this.condition = condition;
-    this.conditionPattern = Pattern.compile(pattern);
-  }
-
-  /**
-   * Returns the affix flag
-   *
-   * @return Affix flag
-   */
-  public char getFlag() {
-    return flag;
-  }
-
-  /**
-   * Sets the affix flag
-   *
-   * @param flag Affix flag
-   */
-  public void setFlag(char flag) {
-    this.flag = flag;
-  }
-
-  /**
-   * Returns whether the affix is defined as cross product
-   *
-   * @return {@code true} if the affix is cross product, {@code false} otherwise
-   */
-  public boolean isCrossProduct() {
-    return crossProduct;
-  }
-
-  /**
-   * Sets whether the affix is defined as cross product
-   *
-   * @param crossProduct Whether the affix is defined as cross product
-   */
-  public void setCrossProduct(boolean crossProduct) {
-    this.crossProduct = crossProduct;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
deleted file mode 100644
index ccb53f57d29..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ /dev/null
@@ -1,507 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.util.Version;
-
-import java.io.*;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-
-/**
- * In-memory structure for the dictionary (.dic) and affix (.aff)
- * data of a hunspell dictionary.
- */
-public class HunspellDictionary {
-
-  static final HunspellWord NOFLAGS = new HunspellWord();
-  
-  private static final String ALIAS_KEY = "AF";
-  private static final String PREFIX_KEY = "PFX";
-  private static final String SUFFIX_KEY = "SFX";
-  private static final String FLAG_KEY = "FLAG";
-
-  private static final String NUM_FLAG_TYPE = "num";
-  private static final String UTF8_FLAG_TYPE = "UTF-8";
-  private static final String LONG_FLAG_TYPE = "long";
-  
-  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
-  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
-
-  private static final boolean IGNORE_CASE_DEFAULT = false;
-  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
-
-  private CharArrayMap<List<HunspellWord>> words;
-  private CharArrayMap<List<HunspellAffix>> prefixes;
-  private CharArrayMap<List<HunspellAffix>> suffixes;
-
-  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
-  private boolean ignoreCase = IGNORE_CASE_DEFAULT;
-
-  private final Version version;
-
-  private String[] aliases;
-  private int aliasCount = 0;
-
-  /**
-   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files.
-   * You have to close the provided InputStreams yourself.
-   *
-   * @param affix InputStream for reading the hunspell affix file (won't be closed).
-   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
-   * @param version Lucene Version
-   * @throws IOException Can be thrown while reading from the InputStreams
-   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
-   */
-  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
-    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
-  }
-
-  /**
-   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files.
-   * You have to close the provided InputStreams yourself.
-   *
-   * @param affix InputStream for reading the hunspell affix file (won't be closed).
-   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
-   * @param version Lucene Version
-   * @param ignoreCase If true, dictionary matching will be case insensitive
-   * @throws IOException Can be thrown while reading from the InputStreams
-   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
-   */
-  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
-    this(affix, Arrays.asList(dictionary), version, ignoreCase);
-  }
-
-  /**
-   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files.
-   * You have to close the provided InputStreams yourself.
-   *
-   * @param affix InputStream for reading the hunspell affix file (won't be closed).
-   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
-   * @param version Lucene Version
-   * @param ignoreCase If true, dictionary matching will be case insensitive
-   * @throws IOException Can be thrown while reading from the InputStreams
-   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
-   */
-  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
-    this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
-  }
-
-  /**
-   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
-   * and dictionary files.
-   * You have to close the provided InputStreams yourself.
-   *
-   * @param affix InputStream for reading the hunspell affix file (won't be closed).
-   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
-   * @param version Lucene Version
-   * @param ignoreCase If true, dictionary matching will be case insensitive
-   * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
-   * @throws IOException Can be thrown while reading from the InputStreams
-   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
-   */
-  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
-    this.version = version;
-    this.ignoreCase = ignoreCase;
-    String encoding = getDictionaryEncoding(affix);
-    CharsetDecoder decoder = getJavaEncoding(encoding);
-    readAffixFile(affix, decoder, strictAffixParsing);
-    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
-    for (InputStream dictionary : dictionaries) {
-      readDictionaryFile(dictionary, decoder);
-    }
-  }
-
-  /**
-   * Looks up HunspellWords that match the String created from the given char array, offset and length
-   *
-   * @param word Char array to generate the String from
-   * @param offset Offset in the char array that the String starts at
-   * @param length Length from the offset that the String is
-   * @return List of HunspellWords that match the generated String, or {@code null} if none are found
-   */
-  public List<HunspellWord> lookupWord(char word[], int offset, int length) {
-    return words.get(word, offset, length);
-  }
-
-  /**
-   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
-   *
-   * @param word Char array to generate the String from
-   * @param offset Offset in the char array that the String starts at
-   * @param length Length from the offset that the String is
-   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
-   */
-  public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
-    return prefixes.get(word, offset, length);
-  }
-
-  /**
-   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
-   *
-   * @param word Char array to generate the String from
-   * @param offset Offset in the char array that the String starts at
-   * @param length Length from the offset that the String is
-   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
-   */
-  public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
-    return suffixes.get(word, offset, length);
-  }
-
-  /**
-   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
-   *
-   * @param affixStream InputStream to read the content of the affix file from
-   * @param decoder CharsetDecoder to decode the content of the file
-   * @throws IOException Can be thrown while reading from the InputStream
-   */
-  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-
-    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
-    String line = null;
-    while ((line = reader.readLine()) != null) {
-      if (line.startsWith(ALIAS_KEY)) {
-        parseAlias(line);
-      } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
-      } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
-      } else if (line.startsWith(FLAG_KEY)) {
-        // Assume that the FLAG line comes before any prefix or suffixes
-        // Store the strategy so it can be used when parsing the dic file
-        flagParsingStrategy = getFlagParsingStrategy(line);
-      }
-    }
-  }
-
-  /**
-   * Parses a specific affix rule putting the result into the provided affix map
-   * 
-   * @param affixes Map where the result of the parsing will be put
-   * @param header Header line of the affix rule
-   * @param reader BufferedReader to read the content of the rule from
-   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
-   *                         pattern
-   * @throws IOException Can be thrown while reading the rule
-   */
-  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
-                          String header,
-                          LineNumberReader reader,
-                          String conditionPattern,
-                          boolean strict) throws IOException, ParseException {
-    String args[] = header.split("\\s+");
-
-    boolean crossProduct = args[2].equals("Y");
-    
-    int numLines = Integer.parseInt(args[3]);
-    for (int i = 0; i < numLines; i++) {
-      String line = reader.readLine();
-      String ruleArgs[] = line.split("\\s+");
-
-      if (ruleArgs.length < 5) {
-        if (strict) {
-          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
-        }
-        continue;
-      }
-
-      HunspellAffix affix = new HunspellAffix();
-      
-      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
-      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
-
-      String affixArg = ruleArgs[3];
-      
-      int flagSep = affixArg.lastIndexOf('/');
-      if (flagSep != -1) {
-        String flagPart = affixArg.substring(flagSep + 1);
-        
-        if (aliasCount > 0) {
-          flagPart = getAliasValue(Integer.parseInt(flagPart));
-        } 
-        
-        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
-        Arrays.sort(appendFlags);
-        affix.setAppendFlags(appendFlags);
-        affix.setAppend(affixArg.substring(0, flagSep));
-      } else {
-        affix.setAppend(affixArg);
-      }
-
-      String condition = ruleArgs[4];
-      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
-      affix.setCrossProduct(crossProduct);
-      
-      List<HunspellAffix> list = affixes.get(affix.getAppend());
-      if (list == null) {
-        list = new ArrayList<HunspellAffix>();
-        affixes.put(affix.getAppend(), list);
-      }
-      
-      list.add(affix);
-    }
-  }
-
-  /**
-   * Parses the encoding specified in the affix file readable through the provided InputStream
-   *
-   * @param affix InputStream for reading the affix file
-   * @return Encoding specified in the affix file
-   * @throws IOException Can be thrown while reading from the InputStream
-   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
-   */
-  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
-    final StringBuilder encoding = new StringBuilder();
-    for (;;) {
-      encoding.setLength(0);
-      int ch;
-      while ((ch = affix.read()) >= 0) {
-        if (ch == '\n') {
-          break;
-        }
-        if (ch != '\r') {
-          encoding.append((char)ch);
-        }
-      }
-      if (
-          encoding.length() == 0 || encoding.charAt(0) == '#' ||
-          // this test only at the end as ineffective but would allow lines only containing spaces:
-          encoding.toString().trim().length() == 0
-      ) {
-        if (ch < 0) {
-          throw new ParseException("Unexpected end of affix file.", 0);
-        }
-        continue;
-      }
-      if ("SET ".equals(encoding.substring(0, 4))) {
-        // cleanup the encoding string, too (whitespace)
-        return encoding.substring(4).trim();
-      }
-      throw new ParseException("The first non-comment line in the affix file must "+
-          "be a 'SET charset', was: '" + encoding +"'", 0);
-    }
-  }
-
-  /**
-   * Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
-   * MICROSOFT-CP1251 etc are allowed...
-   *
-   * @param encoding Encoding to retrieve the CharsetDecoder for
-   * @return CharSetDecoder for the given encoding
-   */
-  private CharsetDecoder getJavaEncoding(String encoding) {
-    Charset charset = Charset.forName(encoding);
-    return charset.newDecoder();
-  }
-
-  /**
-   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
-   *
-   * @param flagLine Line containing the flag information
-   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
-   */
-  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
-    String flagType = flagLine.substring(5);
-
-    if (NUM_FLAG_TYPE.equals(flagType)) {
-      return new NumFlagParsingStrategy();
-    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
-      return new SimpleFlagParsingStrategy();
-    } else if (LONG_FLAG_TYPE.equals(flagType)) {
-      return new DoubleASCIIFlagParsingStrategy();
-    }
-
-    throw new IllegalArgumentException("Unknown flag type: " + flagType);
-  }
-
-  /**
-   * Reads the dictionary file through the provided InputStream, building up the words map
-   *
-   * @param dictionary InputStream to read the dictionary file through
-   * @param decoder CharsetDecoder used to decode the contents of the file
-   * @throws IOException Can be thrown while reading from the file
-   */
-  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
-    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
-    // TODO: don't create millions of strings.
-    String line = reader.readLine(); // first line is number of entries
-    int numEntries = Integer.parseInt(line);
-    
-    // TODO: the flags themselves can be double-chars (long) or also numeric
-    // either way the trick is to encode them as char... but they must be parsed differently
-    while ((line = reader.readLine()) != null) {
-      String entry;
-      HunspellWord wordForm;
-      
-      int flagSep = line.lastIndexOf('/');
-      if (flagSep == -1) {
-        wordForm = NOFLAGS;
-        entry = line;
-      } else {
-        // note, there can be comments (morph description) after a flag.
-        // we should really look for any whitespace
-        int end = line.indexOf('\t', flagSep);
-        if (end == -1)
-          end = line.length();
-        
-        String flagPart = line.substring(flagSep + 1, end);
-        if (aliasCount > 0) {
-          flagPart = getAliasValue(Integer.parseInt(flagPart));
-        } 
-        
-        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
-        Arrays.sort(wordForm.getFlags());
-        entry = line.substring(0, flagSep);
-      }
-      if(ignoreCase) {
-        entry = entry.toLowerCase(Locale.ROOT);
-      }
-
-      List<HunspellWord> entries = new ArrayList<HunspellWord>();
-      entries.add(wordForm);
-      words.put(entry, entries);
-    }
-  }
-
-  public Version getVersion() {
-    return version;
-  }
-
-  private void parseAlias(String line) {
-    String ruleArgs[] = line.split("\\s+");
-    if (aliases == null) {
-      //first line should be the aliases count
-      final int count = Integer.parseInt(ruleArgs[1]);
-      aliases = new String[count];
-    } else {
-      aliases[aliasCount++] = ruleArgs[1];
-    }
-  }
-  
-  private String getAliasValue(int id) {
-    try {
-      return aliases[id - 1];
-    } catch (IndexOutOfBoundsException ex) {
-      throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
-    }
-  }
-
-  /**
-   * Abstraction of the process of parsing flags taken from the affix and dic files
-   */
-  private static abstract class FlagParsingStrategy {
-
-    /**
-     * Parses the given String into a single flag
-     *
-     * @param rawFlag String to parse into a flag
-     * @return Parsed flag
-     */
-    char parseFlag(String rawFlag) {
-      return parseFlags(rawFlag)[0];
-    }
-
-    /**
-     * Parses the given String into multiple flags
-     *
-     * @param rawFlags String to parse into flags
-     * @return Parsed flags
-     */
-    abstract char[] parseFlags(String rawFlags);
-  }
-
-  /**
-   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
-   * Can be used with both the ASCII and UTF-8 flag types.
-   */
-  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
-    /**
-     * {@inheritDoc}
-     */
-    @Override
-    public char[] parseFlags(String rawFlags) {
-      return rawFlags.toCharArray();
-    }
-  }
-
-  /**
-   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
-   * of multiple flags, each number is separated by a comma.
-   */
-  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
-    /**
-     * {@inheritDoc}
-     */
-    @Override
-    public char[] parseFlags(String rawFlags) {
-      String[] rawFlagParts = rawFlags.trim().split(",");
-      char[] flags = new char[rawFlagParts.length];
-
-      for (int i = 0; i < rawFlagParts.length; i++) {
-        // note, removing the trailing X/leading I for nepali... what is the rule here?! 
-        flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
-      }
-
-      return flags;
-    }
-  }
-
-  /**
-   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
-   * must be combined into a single character.
-   *
-   * TODO (rmuir) test
-   */
-  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
-
-    /**
-     * {@inheritDoc}
-     */
-    @Override
-    public char[] parseFlags(String rawFlags) {
-      if (rawFlags.length() == 0) {
-        return new char[0];
-      }
-
-      StringBuilder builder = new StringBuilder();
-      for (int i = 0; i < rawFlags.length(); i+=2) {
-        char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
-        builder.append(cookedFlag);
-      }
-      
-      char flags[] = new char[builder.length()];
-      builder.getChars(0, builder.length(), flags, 0);
-      return flags;
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
index 4ff0a741ad8..a9b512b7bbd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
@@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell;
  */
 
 import java.io.IOException;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.CharsRef;
 
 /**
  * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
@@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
  * </p>
  *
- *
+ * @lucene.experimental
  */
 public final class HunspellStemFilter extends TokenFilter {
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
-  private final HunspellStemmer stemmer;
+  private final Stemmer stemmer;
   
-  private List<Stem> buffer;
+  private List<CharsRef> buffer;
   private State savedState;
   
   private final boolean dedup;
+  private final boolean longestOnly;
 
   /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
    *  recursion level of 2. 
-   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+   *  @see #HunspellStemFilter(TokenStream, Dictionary, int) */
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary) {
     this(input, dictionary, 2);
   }
 
   /**
-   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * HunspellDictionary
+   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * Dictionary
    *
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
    * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
     this(input, dictionary, true, recursionCap);
   }
 
   /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2. 
-   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+   *  @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
     this(input, dictionary, dedup, 2);
   }
-
+  
   /**
    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * HunspellDictionary
+   * Dictionary
    *
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
    * @param dedup true if only unique terms should be output.
    * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
-    super(input);
-    this.dedup = dedup;
-    this.stemmer = new HunspellStemmer(dictionary, recursionCap);
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
+    this(input, dictionary, dedup, recursionCap, false);
   }
 
   /**
-   * {@inheritDoc}
+   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * Dictionary
+   *
+   * @param input TokenStream whose tokens will be stemmed
+   * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
+   * @param dedup true if only unique terms should be output.
+   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
+   * @param longestOnly true if only the longest term should be output.
    */
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) {
+    super(input);
+    this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
+    this.stemmer = new Stemmer(dictionary, recursionCap);
+    this.longestOnly = longestOnly;
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     if (buffer != null && !buffer.isEmpty()) {
-      Stem nextStem = buffer.remove(0);
+      CharsRef nextStem = buffer.remove(0);
       restoreState(savedState);
       posIncAtt.setPositionIncrement(0);
-      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
-      termAtt.setLength(nextStem.getStemLength());
+      termAtt.setEmpty().append(nextStem);
       return true;
     }
     
@@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter {
     if (buffer.isEmpty()) { // we do not know this word, return it unchanged
       return true;
     }     
+    
+    if (longestOnly && buffer.size() > 1) {
+      Collections.sort(buffer, lengthComparator);
+    }
 
-    Stem stem = buffer.remove(0);
-    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
-    termAtt.setLength(stem.getStemLength());
+    CharsRef stem = buffer.remove(0);
+    termAtt.setEmpty().append(stem);
 
-    if (!buffer.isEmpty()) {
-      savedState = captureState();
+    if (longestOnly) {
+      buffer.clear();
+    } else {
+      if (!buffer.isEmpty()) {
+        savedState = captureState();
+      }
     }
 
     return true;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public void reset() throws IOException {
     super.reset();
     buffer = null;
   }
+  
+  static final Comparator<CharsRef> lengthComparator = new Comparator<CharsRef>() {
+    @Override
+    public int compare(CharsRef o1, CharsRef o2) {
+      int cmp = Integer.compare(o2.length, o1.length);
+      if (cmp == 0) {
+        // tie break on text
+        return o2.compareTo(o1);
+      } else {
+        return cmp;
+      }
+    }
+  };
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
index 63e621c2ab9..e632b489d51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
@@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.util.IOUtils;
 
 /**
- * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary, case insensitive matching:
+ * TokenFilterFactory that creates instances of {@link HunspellStemFilter}.
+ * Example config for British English:
  * <pre class="prettyprint">
  * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
- *    dictionary=&quot;en_GB.dic,my_custom.dic&quot;
- *    affix=&quot;en_GB.aff&quot;
- *    ignoreCase=&quot;true&quot; /&gt;</pre>
+ *         dictionary=&quot;en_GB.dic,my_custom.dic&quot;
+ *         affix=&quot;en_GB.aff&quot; 
+ *         ignoreCase=&quot;false&quot;
+ *         longestOnly=&quot;false&quot; /&gt;</pre>
  * Both parameters dictionary and affix are mandatory.
- * <br/>
- * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
- * <br/>
- * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
- * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored.
- * <br/>
  * Dictionaries for many languages are available through the OpenOffice project.
  * 
  * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
+ * @lucene.experimental
  */
 public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
-  private static final String PARAM_DICTIONARY = "dictionary";
-  private static final String PARAM_AFFIX = "affix";
-  private static final String PARAM_IGNORE_CASE = "ignoreCase";
-  private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
+  private static final String PARAM_DICTIONARY    = "dictionary";
+  private static final String PARAM_AFFIX         = "affix";
   private static final String PARAM_RECURSION_CAP = "recursionCap";
+  private static final String PARAM_IGNORE_CASE   = "ignoreCase";
+  private static final String PARAM_LONGEST_ONLY  = "longestOnly";
 
-  private final String dictionaryArg;
+  private final String dictionaryFiles;
   private final String affixFile;
   private final boolean ignoreCase;
-  private final boolean strictAffixParsing;
-  private HunspellDictionary dictionary;
+  private final boolean longestOnly;
+  private Dictionary dictionary;
   private int recursionCap;
   
   /** Creates a new HunspellStemFilterFactory */
   public HunspellStemFilterFactory(Map<String,String> args) {
     super(args);
-    assureMatchVersion();
-    dictionaryArg = require(args, PARAM_DICTIONARY);
+    dictionaryFiles = require(args, PARAM_DICTIONARY);
     affixFile = get(args, PARAM_AFFIX);
     ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
-    strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
     recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
+    longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
+    // this isn't necessary: we properly load all dictionaries.
+    // but recognize and ignore for back compat
+    getBoolean(args, "strictAffixParsing", true);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
   }
 
-  /**
-   * Loads the hunspell dictionary and affix files defined in the configuration
-   *  
-   * @param loader ResourceLoader used to load the files
-   */
   @Override
   public void inform(ResourceLoader loader) throws IOException {
-    String dictionaryFiles[] = dictionaryArg.split(",");
+    String dicts[] = dictionaryFiles.split(",");
 
     InputStream affix = null;
     List<InputStream> dictionaries = new ArrayList<InputStream>();
 
     try {
       dictionaries = new ArrayList<InputStream>();
-      for (String file : dictionaryFiles) {
+      for (String file : dicts) {
         dictionaries.add(loader.openResource(file));
       }
       affix = loader.openResource(affixFile);
 
-      this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
+      this.dictionary = new Dictionary(affix, dictionaries, ignoreCase);
     } catch (ParseException e) {
-      throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e);
+      throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryFiles + ",affix=" + affixFile + "]", e);
     } finally {
       IOUtils.closeWhileHandlingException(affix);
       IOUtils.closeWhileHandlingException(dictionaries);
     }
   }
 
-  /**
-   * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
-   * TokenStream
-   *
-   * @param tokenStream TokenStream that will be filtered
-   * @return HunspellStemFilter that filters the TokenStream 
-   */
   @Override
   public TokenStream create(TokenStream tokenStream) {
-    return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
+    return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly);
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
deleted file mode 100644
index ae2948284d6..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
+++ /dev/null
@@ -1,392 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.util.Version;
-
-/**
- * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word.  It
- * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
- */
-public class HunspellStemmer {
-  private final int recursionCap;
-  private final HunspellDictionary dictionary;
-  private final StringBuilder segment = new StringBuilder();
-  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
-
-  /**
-   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the 
-   * default recursion cap of <code>2</code> (based on Hunspell documentation). 
-   *
-   * @param dictionary HunspellDictionary that will be used to create the stems
-   */
-  public HunspellStemmer(HunspellDictionary dictionary) {
-    this(dictionary, 2);
-  }
-
-  /**
-   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
-   *
-   * @param dictionary HunspellDictionary that will be used to create the stems
-   * @param recursionCap maximum level of recursion stemmer can go into
-   */
-  public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
-    this.dictionary = dictionary;
-    this.recursionCap = recursionCap;
-  } 
-  
-  /**
-   * Find the stem(s) of the provided word
-   * 
-   * @param word Word to find the stems for
-   * @return List of stems for the word
-   */
-  public List<Stem> stem(String word) {
-    return stem(word.toCharArray(), word.length());
-  }
-
-  /**
-   * Find the stem(s) of the provided word
-   * 
-   * @param word Word to find the stems for
-   * @return List of stems for the word
-   */
-  public List<Stem> stem(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
-    if (dictionary.lookupWord(word, 0, length) != null) {
-      stems.add(new Stem(word, length));
-    }
-    stems.addAll(stem(word, length, null, 0));
-    return stems;
-  }
-  
-  /**
-   * Find the unique stem(s) of the provided word
-   * 
-   * @param word Word to find the stems for
-   * @return List of stems for the word
-   */
-  public List<Stem> uniqueStems(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
-    CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
-    if (dictionary.lookupWord(word, 0, length) != null) {
-      stems.add(new Stem(word, length));
-      terms.add(word);
-    }
-    List<Stem> otherStems = stem(word, length, null, 0);
-    for (Stem s : otherStems) {
-      if (!terms.contains(s.stem)) {
-        stems.add(s);
-        terms.add(s.stem);
-      }
-    }
-    return stems;
-  }
-
-  // ================================================= Helper Methods ================================================
-
-  /**
-   * Generates a list of stems for the provided word
-   *
-   * @param word Word to generate the stems for
-   * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
-   * @param recursionDepth Level of recursion this stemming step is at
-   * @return List of stems, pr an empty if no stems are found
-   */
-  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
-    List<Stem> stems = new ArrayList<Stem>();
-
-    for (int i = 0; i < length; i++) {
-      List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
-      if (suffixes == null) {
-        continue;
-      }
-
-      for (HunspellAffix suffix : suffixes) {
-        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
-          int deAffixedLength = length - suffix.getAppend().length();
-          // TODO: can we do this in-place?
-          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
-
-          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
-          for (Stem stem : stemList) {
-            stem.addSuffix(suffix);
-          }
-
-          stems.addAll(stemList);
-        }
-      }
-    }
-
-    for (int i = length - 1; i >= 0; i--) {
-      List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
-      if (prefixes == null) {
-        continue;
-      }
-
-      for (HunspellAffix prefix : prefixes) {
-        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
-          int deAffixedStart = prefix.getAppend().length();
-          int deAffixedLength = length - deAffixedStart;
-
-          String strippedWord = new StringBuilder().append(prefix.getStrip())
-              .append(word, deAffixedStart, deAffixedLength)
-              .toString();
-
-          List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
-          for (Stem stem : stemList) {
-            stem.addPrefix(prefix);
-          }
-
-          stems.addAll(stemList);
-        }
-      }
-    }
-
-    return stems;
-  }
-
-  /**
-   * Applies the affix rule to the given word, producing a list of stems if any are found
-   *
-   * @param strippedWord Word the affix has been removed and the strip added
-   * @param affix HunspellAffix representing the affix rule itself
-   * @param recursionDepth Level of recursion this stemming step is at
-   * @return List of stems for the word, or an empty list if none are found
-   */
-  @SuppressWarnings("unchecked")
-  public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
-    if(dictionary.isIgnoreCase()) {
-      charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
-    }
-    segment.setLength(0);
-    segment.append(strippedWord, 0, length);
-    if (!affix.checkCondition(segment)) {
-      return Collections.EMPTY_LIST;
-    }
-
-    List<Stem> stems = new ArrayList<Stem>();
-
-    List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
-    if (words != null) {
-      for (HunspellWord hunspellWord : words) {
-        if (hunspellWord.hasFlag(affix.getFlag())) {
-          stems.add(new Stem(strippedWord, length));
-        }
-      }
-    }
-
-    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
-    }
-
-    return stems;
-  }
-
-  /**
-   * Checks if the given flag cross checks with the given array of flags
-   *
-   * @param flag Flag to cross check with the array of flags
-   * @param flags Array of flags to cross check against.  Can be {@code null}
-   * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
-   */
-  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
-    return flags == null || Arrays.binarySearch(flags, flag) >= 0;
-  }
-
-  /**
-   * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
-   * that were used to change the word into the stem.
-   */
-  public static class Stem {
-
-    private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
-    private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
-    private final char stem[];
-    private final int stemLength;
-
-    /**
-     * Creates a new Stem wrapping the given word stem
-     *
-     * @param stem Stem of a word
-     */
-    public Stem(char stem[], int stemLength) {
-      this.stem = stem;
-      this.stemLength = stemLength;
-    }
-
-    /**
-     * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
-     * depth first, the prefix is added to the front of the list
-     *
-     * @param prefix Prefix to add to the list of prefixes for this stem
-     */
-    public void addPrefix(HunspellAffix prefix) {
-      prefixes.add(0, prefix);
-    }
-
-    /**
-     * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
-     * depth first, the suffix is added to the end of the list
-     *
-     * @param suffix Suffix to add to the list of suffixes for this stem
-     */
-    public void addSuffix(HunspellAffix suffix) {
-      suffixes.add(suffix);
-    }
-
-    /**
-     * Returns the list of prefixes used to generate the stem
-     *
-     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
-     */
-    public List<HunspellAffix> getPrefixes() {
-      return prefixes;
-    }
-
-    /**
-     * Returns the list of suffixes used to generate the stem
-     *
-     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
-     */
-    public List<HunspellAffix> getSuffixes() {
-      return suffixes;
-    }
-
-    /**
-     * Returns the actual word stem itself
-     *
-     * @return Word stem itself
-     */
-    public char[] getStem() {
-      return stem;
-    }
-
-    /**
-     * @return the stemLength
-     */
-    public int getStemLength() {
-      return stemLength;
-    }
-    
-    public String getStemString() {
-      return new String(stem, 0, stemLength);
-    }
-    
-  }
-
-
-  // ================================================= Entry Point ===================================================
-
-  /*
-   * HunspellStemmer entry point.  Accepts two arguments: location of affix file and location of dic file
-   *
-   * @param args Program arguments.  Should contain location of affix file and location of dic file
-   * @throws IOException Can be thrown while reading from the files
-   * @throws ParseException Can be thrown while parsing the files
-  public static void main(String[] args) throws IOException, ParseException {
-    boolean ignoreCase = false;
-    int offset = 0;
-    
-    if (args.length < 2) {
-      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
-      System.exit(1);
-    }
-
-    if(args[offset].equals("-i")) {
-      ignoreCase = true;
-      System.out.println("Ignoring case. All stems will be returned lowercased");
-      offset++;
-    }
-    
-    InputStream affixInputStream = new FileInputStream(args[offset++]);
-    InputStream dicInputStream = new FileInputStream(args[offset++]);
-
-    // :Post-Release-Update-Version.LUCENE_XY:
-    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);
-
-    affixInputStream.close();
-    dicInputStream.close();
-    
-    HunspellStemmer stemmer = new HunspellStemmer(dictionary);
-
-    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
-    
-    System.out.print("> ");
-    while (scanner.hasNextLine()) {
-      String word = scanner.nextLine();
-      
-      if ("exit".equals(word)) {
-        break;
-      }
-
-      printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
-      
-      System.out.print("> ");
-    }
-  }
-
-   * Prints the results of the stemming of a word
-   *
-   * @param originalWord Word that has been stemmed
-   * @param stems Stems of the word
-  private static void printStemResults(String originalWord, List<Stem> stems) {
-    StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
-
-    for (Stem stem : stems) {
-      builder.append("- ").append(stem.getStem()).append(": ");
-
-      for (HunspellAffix prefix : stem.getPrefixes()) {
-        builder.append(prefix.getAppend()).append("+");
-
-        if (hasText(prefix.getStrip())) {
-          builder.append(prefix.getStrip()).append("-");
-        }
-      }
-
-      builder.append(stem.getStem());
-
-      for (HunspellAffix suffix : stem.getSuffixes()) {
-        if (hasText(suffix.getStrip())) {
-          builder.append("-").append(suffix.getStrip());
-        }
-        
-        builder.append("+").append(suffix.getAppend());
-      }
-      builder.append("\n");
-    }
-
-    System.out.println(builder);
-  }
-
-   * Simple utility to check if the given String has any text
-   *
-   * @param str String to check if it has any text
-   * @return {@code true} if the String has text, {@code false} otherwise
-  private static boolean hasText(String str) {
-    return str != null && str.length() > 0;
-  }
-  */
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
deleted file mode 100644
index fe216d30dc8..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Arrays;
-
-/**
- * A dictionary (.dic) entry with its associated flags.
- */
-public class HunspellWord {
-  
-  private final char flags[]; // sorted, can we represent more concisely?
-
-  /**
-   * Creates a new HunspellWord with no associated flags
-   */
-  public HunspellWord() {
-    flags = null;
-  }
-
-  /**
-   * Constructs a new HunspellWord with the given flags
-   *
-   * @param flags Flags to associate with the word
-   */
-  public HunspellWord(char[] flags) {
-    this.flags = flags;
-  }
-
-  /**
-   * Checks whether the word has the given flag associated with it
-   *
-   * @param flag Flag to check whether it is associated with the word
-   * @return {@code true} if the flag is associated, {@code false} otherwise
-   */
-  public boolean hasFlag(char flag) {
-    return flags != null && Arrays.binarySearch(flags, flag) >= 0;
-  }
-
-  /**
-   * Returns the flags associated with the word
-   *
-   * @return Flags associated with the word
-   */
-  public char[] getFlags() {
-    return flags;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
similarity index 98%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
index 4de0d4bc051..2d87947ab3d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
similarity index 92%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index d6b0133830a..18e6588ce7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -24,6 +24,7 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@@ -37,9 +38,10 @@ import org.apache.lucene.util.Version;
 final class Stemmer {
   private final int recursionCap;
   private final Dictionary dictionary;
-  private BytesRef scratch = new BytesRef();
+  private final BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
   private final ByteArrayDataInput affixReader;
+  private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
 
   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
@@ -80,6 +82,9 @@ final class Stemmer {
    * @return List of stems for the word
    */
   public List<CharsRef> stem(char word[], int length) {
+    if (dictionary.ignoreCase) {
+      charUtils.toLowerCase(word, 0, length);
+    }
     List<CharsRef> stems = new ArrayList<CharsRef>();
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
       stems.add(new CharsRef(word, 0, length));
@@ -95,20 +100,19 @@ final class Stemmer {
    * @return List of stems for the word
    */
   public List<CharsRef> uniqueStems(char word[], int length) {
-    List<CharsRef> stems = new ArrayList<CharsRef>();
-    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
-    if (dictionary.lookupWord(word, 0, length, scratch) != null) {
-      stems.add(new CharsRef(word, 0, length));
-      terms.add(word);
+    List<CharsRef> stems = stem(word, length);
+    if (stems.size() < 2) {
+      return stems;
     }
-    List<CharsRef> otherStems = stem(word, length, Dictionary.NOFLAGS, 0);
-    for (CharsRef s : otherStems) {
+    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
+    List<CharsRef> deduped = new ArrayList<>();
+    for (CharsRef s : stems) {
       if (!terms.contains(s)) {
-        stems.add(s);
+        deduped.add(s);
         terms.add(s);
       }
     }
-    return stems;
+    return deduped;
   }
 
   // ================================================= Helper Methods ================================================
@@ -188,7 +192,7 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
+  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
deleted file mode 100644
index 00ff88469be..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.CharsRef;
-
-/**
- * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
- * stems, this filter can emit multiple tokens for each consumed token
- *
- * <p>
- * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
- * certain terms from being passed to the stemmer
- * {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
- * in a previous {@link TokenStream}.
- *
- * Note: For including the original term as well as the stemmed version, see
- * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
- * </p>
- *
- * @lucene.experimental
- */
-public final class Hunspell2StemFilter extends TokenFilter {
-  
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
-  private final Stemmer stemmer;
-  
-  private List<CharsRef> buffer;
-  private State savedState;
-  
-  private final boolean dedup;
-
-  /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
-   *  recursion level of 2. 
-   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
-  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
-    this(input, dictionary, 2);
-  }
-
-  /**
-   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * Dictionary
-   *
-   * @param input TokenStream whose tokens will be stemmed
-   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
-   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
-   */
-  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
-    this(input, dictionary, true, recursionCap);
-  }
-
-  /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2. 
-   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
-  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
-    this(input, dictionary, dedup, 2);
-  }
-
-  /**
-   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * Dictionary
-   *
-   * @param input TokenStream whose tokens will be stemmed
-   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
-   * @param dedup true if only unique terms should be output.
-   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
-   */
-  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
-    super(input);
-    this.dedup = dedup;
-    this.stemmer = new Stemmer(dictionary, recursionCap);
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    if (buffer != null && !buffer.isEmpty()) {
-      CharsRef nextStem = buffer.remove(0);
-      restoreState(savedState);
-      posIncAtt.setPositionIncrement(0);
-      termAtt.setEmpty().append(nextStem);
-      return true;
-    }
-    
-    if (!input.incrementToken()) {
-      return false;
-    }
-    
-    if (keywordAtt.isKeyword()) {
-      return true;
-    }
-    
-    buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
-
-    if (buffer.isEmpty()) { // we do not know this word, return it unchanged
-      return true;
-    }     
-
-    CharsRef stem = buffer.remove(0);
-    termAtt.setEmpty().append(stem);
-
-    if (!buffer.isEmpty()) {
-      savedState = captureState();
-    }
-
-    return true;
-  }
-
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    buffer = null;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
deleted file mode 100644
index 6ce73698dfd..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-
-/**
- * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
- * Example config for British English:
- * <pre class="prettyprint">
- * &lt;filter class=&quot;solr.Hunspell2StemFilterFactory&quot;
- *         dictionary=&quot;en_GB.dic&quot;
- *         affix=&quot;en_GB.aff&quot; /&gt;</pre>
- * Both parameters dictionary and affix are mandatory.
- * Dictionaries for many languages are available through the OpenOffice project.
- * 
- * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
- * @lucene.experimental
- */
-public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
-  private static final String PARAM_DICTIONARY    = "dictionary";
-  private static final String PARAM_AFFIX         = "affix";
-  private static final String PARAM_RECURSION_CAP = "recursionCap";
-
-  private final String dictionaryFile;
-  private final String affixFile;
-  private Dictionary dictionary;
-  private int recursionCap;
-  
-  /** Creates a new Hunspell2StemFilterFactory */
-  public Hunspell2StemFilterFactory(Map<String,String> args) {
-    super(args);
-    dictionaryFile = require(args, PARAM_DICTIONARY);
-    affixFile = get(args, PARAM_AFFIX);
-    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
-  }
-
-  @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    try (InputStream affix = loader.openResource(affixFile);
-        InputStream dictionary = loader.openResource(dictionaryFile)) {
-      try {
-        this.dictionary = new Dictionary(affix, dictionary);
-      } catch (ParseException e) {
-        throw new RuntimeException(e);
-      }
-    }
-  }
-
-  @Override
-  public TokenStream create(TokenStream tokenStream) {
-    return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
deleted file mode 100644
index 196591969e8..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-  -->
-<html>
-<body>
-Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
-Hunspell stemming algorithm.</a>
-<p>
-Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
-OpenOffice's wiki</a>
-</p>
-</body>
-</html>
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index e4ca7c6802c..04fc80cf59c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
 org.apache.lucene.analysis.hi.HindiStemFilterFactory
 org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
 org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
-org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
 org.apache.lucene.analysis.id.IndonesianStemFilterFactory
 org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
 org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index bca5e1ede50..617e7523b69 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
 import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.hunspell.HunspellDictionary;
-import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
 import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
@@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?
       }
     });
-    put(HunspellDictionary.class, new ArgProducer() {
+    put(Dictionary.class, new ArgProducer() {
       @Override public Object create(Random random) {
         // TODO: make nastier
-        InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
-        InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
+        InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
+        InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
         try {
-         return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
+         return new Dictionary(affixStream, dictStream);
         } catch (Exception ex) {
           Rethrow.rethrow(ex);
           return null; // unreachable code
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
deleted file mode 100644
index fd8f9211727..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ /dev/null
@@ -1,201 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class HunspellDictionaryTest extends LuceneTestCase {
-  
-  private class CloseCheckInputStream extends InputStream {
-    private InputStream delegate;
-    
-    private boolean closed = false;
-
-    public CloseCheckInputStream(InputStream delegate) {
-      super();
-      this.delegate = delegate;
-    }
-
-    @Override
-    public int read() throws IOException {
-      return delegate.read();
-    }
-
-    @Override
-    public int hashCode() {
-      return delegate.hashCode();
-    }
-
-    @Override
-    public int read(byte[] b) throws IOException {
-      return delegate.read(b);
-    }
-
-    @Override
-    public boolean equals(Object obj) {
-      return delegate.equals(obj);
-    }
-
-    @Override
-    public int read(byte[] b, int off, int len) throws IOException {
-      return delegate.read(b, off, len);
-    }
-
-    @Override
-    public long skip(long n) throws IOException {
-      return delegate.skip(n);
-    }
-
-    @Override
-    public String toString() {
-      return delegate.toString();
-    }
-
-    @Override
-    public int available() throws IOException {
-      return delegate.available();
-    }
-
-    @Override
-    public void close() throws IOException {
-      this.closed = true;
-      delegate.close();
-    }
-
-    @Override
-    public void mark(int readlimit) {
-      delegate.mark(readlimit);
-    }
-
-    @Override
-    public void reset() throws IOException {
-      delegate.reset();
-    }
-
-    @Override
-    public boolean markSupported() {
-      return delegate.markSupported();
-    }
-    
-    public boolean isClosed() {
-      return this.closed;
-    }
-    
-  }
-
-  @Test
-  public void testResourceCleanup() throws IOException, ParseException {
-    CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff"));
-    CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic"));
-    
-    new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
-    
-    assertFalse(affixStream.isClosed());
-    assertFalse(dictStream.isClosed());
-    
-    affixStream.close();
-    dictStream.close();
-    
-    assertTrue(affixStream.isClosed());
-    assertTrue(dictStream.isClosed());
-  }
-
-  @Test
-  public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
-    InputStream affixStream = getClass().getResourceAsStream("test.aff");
-    InputStream dictStream = getClass().getResourceAsStream("test.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
-    assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
-
-    affixStream.close();
-    dictStream.close();
-  }
-
-  @Test
-  public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException {
-    InputStream affixStream = getClass().getResourceAsStream("test.aff");
-    List<InputStream> dictStreams = new ArrayList<InputStream>();
-    dictStreams.add(getClass().getResourceAsStream("test.dic"));
-    dictStreams.add(getClass().getResourceAsStream("testOverride.dic"));
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false);
-    assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
-    assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length);
-
-    affixStream.close();
-    for(InputStream dstream : dictStreams) {
-      dstream.close();
-    }
-  }
-
-  @Test
-  public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
-    InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
-    InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
-    
-    affixStream.close();
-    dictStream.close();
-  }
-
-  @Test
-  public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
-    InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
-    InputStream dictStream = getClass().getResourceAsStream("test.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
-    //strict parsing disabled: malformed rule is not loaded
-    assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));    
-    affixStream.close();
-    dictStream.close();
-
-    affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
-    dictStream = getClass().getResourceAsStream("test.dic");
-    //strict parsing enabled: malformed rule causes ParseException
-    try {
-      dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
-      Assert.fail();
-    } catch(ParseException e) {
-      Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
-      Assert.assertEquals(23, e.getErrorOffset());
-    }
-    
-    affixStream.close();
-    dictStream.close();
-  }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
deleted file mode 100644
index dd273fa8dc5..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
+++ /dev/null
@@ -1,92 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.Arrays;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.TestUtil;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-
-public class HunspellStemFilterTest  extends BaseTokenStreamTestCase {
-  
-  private static HunspellDictionary DICTIONARY;
-  @BeforeClass
-  public static void beforeClass() throws IOException, ParseException {
-    DICTIONARY = createDict(true);
-  }
-  @AfterClass
-  public static void afterClass() {
-    DICTIONARY = null;
-  }
-  public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException {
-    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
-    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
-    return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
-  }
-  
-  /**
-   * Simple test for KeywordAttribute
-   */
-  public void testKeywordAttribute() throws IOException {
-    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
-    tokenizer.setEnableChecks(true);
-    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3));
-    assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
-    
-    // assert with keywork marker
-    tokenizer = whitespaceMockTokenizer("lucene is awesome");
-    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
-    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3));
-    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
-  }
-  
-  /** blast some random strings through the analyzer */
-  public void testRandomStrings() throws Exception {
-    Analyzer analyzer = new Analyzer() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
-      }  
-    };
-    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
-  }
-  
-  public void testEmptyTerm() throws IOException {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new KeywordTokenizer();
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
-      }
-    };
-    checkOneTerm(a, "", "");
-  }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
deleted file mode 100644
index 66a9410c27a..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.Version;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.List;
-
-import static junit.framework.Assert.assertEquals;
-
-public class HunspellStemmerTest extends LuceneTestCase {
-
-  private static HunspellStemmer stemmer;
-
-  @BeforeClass
-  public static void beforeClass() throws IOException, ParseException {
-    createStemmer(true);
-  }
-  
-  @AfterClass
-  public static void afterClass() {
-    stemmer = null;
-  }
-
-  @Test
-  public void testStem_simpleSuffix() {
-    List<HunspellStemmer.Stem> stems = stemmer.stem("lucene");
-
-    assertEquals(2, stems.size());
-    assertEquals("lucene", stems.get(0).getStemString());
-    assertEquals("lucen", stems.get(1).getStemString());
-
-    stems = stemmer.stem("mahoute");
-    assertEquals(1, stems.size());
-    assertEquals("mahout", stems.get(0).getStemString());
-  }
-
-  @Test
-  public void testStem_simplePrefix() {
-    List<HunspellStemmer.Stem> stems = stemmer.stem("solr");
-
-    assertEquals(1, stems.size());
-    assertEquals("olr", stems.get(0).getStemString());
-  }
-
-  @Test
-  public void testStem_recursiveSuffix() {
-    List<HunspellStemmer.Stem> stems = stemmer.stem("abcd");
-
-    assertEquals(1, stems.size());
-    assertEquals("ab", stems.get(0).getStemString());
-  }
-
-  @Test
-  public void testStem_ignoreCase() throws IOException, ParseException {
-    List<HunspellStemmer.Stem> stems;
-    createStemmer(true);
-
-    stems = stemmer.stem("apache");
-    assertEquals(1, stems.size());
-    assertEquals("apach", stems.get(0).getStemString());
-
-    stems = stemmer.stem("APACHE");
-    assertEquals(1, stems.size());
-    assertEquals("apach", stems.get(0).getStemString());
-
-    stems = stemmer.stem("Apache");
-    assertEquals(1, stems.size());
-    assertEquals("apach", stems.get(0).getStemString());
-    
-    stems = stemmer.stem("foos");
-    assertEquals(1, stems.size());
-    assertEquals("foo", stems.get(0).getStemString());
-    
-    stems = stemmer.stem("mood");
-    assertEquals(1, stems.size());
-    assertEquals("moo", stems.get(0).getStemString());
-    
-    stems = stemmer.stem("Foos");
-    assertEquals(1, stems.size());
-    assertEquals("foo", stems.get(0).getStemString());
-
-    // The "Foo" rule gets overridden by the "foo" rule, and we don't merge
-    stems = stemmer.stem("Food");
-    assertEquals(0, stems.size());
-
-    stems = stemmer.stem("Mood");
-    assertEquals(1, stems.size());
-    assertEquals("moo", stems.get(0).getStemString());
-  }
-
-  @Test
-  public void testStem_caseSensitive() throws IOException, ParseException {
-    createStemmer(false);
-    List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
-    assertEquals(0, stems.size());
-
-    stems = stemmer.stem("Apache");
-    assertEquals(1, stems.size());
-    assertEquals("Apach", stems.get(0).getStemString());
-  }
-
-  
-  private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
-    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
-    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
-    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
-    stemmer = new HunspellStemmer(dictionary);
-
-    affixStream.close();
-    dictStream.close();
-  }
-
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
similarity index 93%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index d00fc634944..3322eb109a6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,7 +22,7 @@ import java.io.InputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
-import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -33,7 +33,7 @@ import org.junit.Ignore;
  * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
  * Note some of the files differ only in case. This may be a problem on your operating system!
  */
-//@Ignore("enable manually")
+@Ignore("enable manually")
 public class TestAllDictionaries extends LuceneTestCase {
   
   // set this to the location of where you downloaded all the files
@@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase {
         assert dicEntry != null;
         ZipEntry affEntry = zip.getEntry(tests[i+2]);
         assert affEntry != null;
-        
-        // get ram from previous impl
-        String oldRAM = "FAIL";
-        try (InputStream dictionary = zip.getInputStream(dicEntry);
-            InputStream affix = zip.getInputStream(affEntry)) {
-          try {
-            HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
-            oldRAM = RamUsageEstimator.humanSizeOf(dic);
-          } catch (Throwable t) {}
-       }
       
         try (InputStream dictionary = zip.getInputStream(dicEntry);
              InputStream affix = zip.getInputStream(affEntry)) {
           Dictionary dic = new Dictionary(affix, dictionary);
-          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
+          System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
                              "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
@@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase {
         
           try (InputStream dictionary = zip.getInputStream(dicEntry);
                InputStream affix = zip.getInputStream(affEntry)) {
-              Dictionary dic = new Dictionary(affix, dictionary);
+              new Dictionary(affix, dictionary);
           }
         }
       }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
new file mode 100644
index 00000000000..64bdb41e8c7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.Stemmer;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class TestCaseInsensitive extends LuceneTestCase {
+  private static Stemmer stemmer;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff");
+        InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) {
+     Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
+     stemmer = new Stemmer(dictionary);
+   }
+  }
+  
+  @AfterClass
+  public static void afterClass() {
+    stemmer = null;
+  }
+
+  public void testCaseInsensitivity() {
+    assertStemsTo("lucene", "lucene", "lucen");
+    assertStemsTo("LuCeNe", "lucene", "lucen");
+    assertStemsTo("mahoute", "mahout");
+    assertStemsTo("MaHoUte", "mahout");
+  }
+
+  public void testSimplePrefix() {
+    assertStemsTo("solr", "olr");
+  }
+
+  public void testRecursiveSuffix() {
+    assertStemsTo("abcd", "ab");
+  }
+
+  // all forms unmunched from dictionary
+  public void testAllStems() {
+    assertStemsTo("ab", "ab");
+    assertStemsTo("abc", "ab");
+    assertStemsTo("apach", "apach");
+    assertStemsTo("apache", "apach");
+    assertStemsTo("foo", "foo");
+    assertStemsTo("food", "foo");
+    assertStemsTo("foos", "foo");
+    assertStemsTo("lucen", "lucen");
+    assertStemsTo("lucene", "lucen", "lucene");
+    assertStemsTo("mahout", "mahout");
+    assertStemsTo("mahoute", "mahout");
+    assertStemsTo("moo", "moo");
+    assertStemsTo("mood", "moo");
+    assertStemsTo("olr", "olr");
+    assertStemsTo("solr", "olr");
+  }
+  
+  // some bogus stuff that should not stem (empty lists)!
+  public void testBogusStems() {    
+    assertStemsTo("abs");
+    assertStemsTo("abe");
+    assertStemsTo("sab");
+    assertStemsTo("sapach");
+    assertStemsTo("sapache");
+    assertStemsTo("apachee");
+    assertStemsTo("sfoo");
+    assertStemsTo("sfoos");
+    assertStemsTo("fooss");
+    assertStemsTo("lucenee");
+    assertStemsTo("solre");
+  }
+  
+  private void assertStemsTo(String s, String... expected) {
+    Arrays.sort(expected);
+    
+    List<CharsRef> stems = stemmer.stem(s);
+    String actual[] = new String[stems.size()];
+    for (int i = 0; i < actual.length; i++) {
+      actual[i] = stems.get(i).toString();
+    }
+    Arrays.sort(actual);
+    
+    assertArrayEquals(expected, actual);
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
similarity index 97%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index e8e0fd0d030..6cbe931d376 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 
+import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
similarity index 75%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
index eafb1f272cf..af48427d522 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
-public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
+public class TestHunspellStemFilter extends BaseTokenStreamTestCase {
   private static Dictionary dictionary;
   
   @BeforeClass
@@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
   public void testKeywordAttribute() throws IOException {
     MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
     tokenizer.setEnableChecks(true);
-    Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
     assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
     
     // assert with keyword marker
     tokenizer = whitespaceMockTokenizer("lucene is awesome");
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
-    filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
+  }
+  
+  /** simple test for longestOnly option */
+  public void testLongestOnly() throws IOException {
+    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
+    tokenizer.setEnableChecks(true);
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true);
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
   
@@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
       }  
     };
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new KeywordTokenizer();
-        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
       }
     };
     checkOneTerm(a, "", "");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
index e8e232ce60b..f4302035dbc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.Reader;
 import java.io.StringReader;
 
-import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
 
@@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
   public void testStemming() throws Exception {
     Reader reader = new StringReader("abc");
     TokenStream stream = whitespaceMockTokenizer(reader);
-    stream = tokenFilterFactory("HunspellStem",
-        "dictionary", "test.dic",
-        "affix", "test.aff").create(stream);
+    stream = tokenFilterFactory("Hunspell2Stem",
+        "dictionary", "simple.dic",
+        "affix", "simple.aff").create(stream);
     assertTokenStreamContents(stream, new String[] { "ab" });
   }
   
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
-      tokenFilterFactory("HunspellStem",
-          "dictionary", "test.dic",
+      tokenFilterFactory("Hunspell2Stem",
+          "dictionary", "simple.dic",
           "bogusArg", "bogusValue");
       fail();
     } catch (IllegalArgumentException expected) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
similarity index 95%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
index 4dec107f314..dca9faa6b16 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.Stemmer;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.AfterClass;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
new file mode 100644
index 00000000000..9fae253279e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
@@ -0,0 +1,10 @@
+9
+Ab/C
+apach/A
+Foo/D
+foo/E
+Lucen/A
+Lucene
+mahout/A
+Moo/E
+olr/B
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
deleted file mode 100644
index db9423dcad1..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
+++ /dev/null
@@ -1,20 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-SFX A Y 3
-SFX A   0     e         n
-SFX A   0     e         t
-SFX A   0     e         h
-
-SFX C Y 2
-SFX C   0     d/C       c
-SFX C   0     c         b
-
-SFX D Y 1
-SFX D   0     s         o
-
-SFX E Y 1
-SFX E   0     d         o
-
-PFX B Y 1
-PFX B   0     s         o
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
deleted file mode 100644
index 12efd8fccb2..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
+++ /dev/null
@@ -1,10 +0,0 @@
-9
-lucen/A
-lucene
-mahout/A
-olr/B
-ab/C
-Apach/A
-Foo/E
-foo/D
-Moo/E
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
deleted file mode 100644
index e4a1b37300f..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
+++ /dev/null
@@ -1,29 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-FLAG long
-
-AF 5
-AF AA
-AF BB
-AF CC
-AF DD
-AF EE
-
-SFX AA Y 3
-SFX AA   0     e         n
-SFX AA   0     e         t
-SFX AA   0     e         h
-
-SFX CC Y 2
-SFX CC   0     d/3       c
-SFX CC   0     c         b
-
-SFX DD Y 1
-SFX DD   0     s         o
-
-SFX EE Y 1
-SFX EE   0     d         o
-
-PFX BB Y 1
-PFX BB   0     s         o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
deleted file mode 100644
index bf237662017..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
+++ /dev/null
@@ -1,9 +0,0 @@
-6
-lucen/1
-lucene
-mahout/1
-olr/2
-ab/3
-Apach/1
-foo/4
-Foo/5
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
deleted file mode 100644
index c1111ef562b..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
+++ /dev/null
@@ -1,3 +0,0 @@
-2
-lucen/ABC
-bar/A
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
deleted file mode 100644
index 3b780cd1d7b..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
+++ /dev/null
@@ -1,24 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-SFX A Y 3
-SFX A   0     e         n
-SFX A   0     e         t
-SFX A   0     e         h
-
-SFX C Y 2
-SFX C   0     d/C       c
-SFX C   0     c         b
-
-SFX D Y 1
-SFX D   0     s         o
-
-SFX E Y 1
-SFX E   0     d         o
-
-PFX B Y 1
-PFX B   0     s         o
-
-#wrong rule (only 4 elements)
-PFX A0 Y 1
-PFX A0 0 a
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
deleted file mode 100644
index d95e2be04b6..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
+++ /dev/null
@@ -1,50 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-
-/**
- * Simple tests to ensure the Hunspell stemmer loads from factory
- */
-public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
-  public void testStemming() throws Exception {
-    Reader reader = new StringReader("abc");
-    TokenStream stream = whitespaceMockTokenizer(reader);
-    stream = tokenFilterFactory("Hunspell2Stem",
-        "dictionary", "simple.dic",
-        "affix", "simple.aff").create(stream);
-    assertTokenStreamContents(stream, new String[] { "ab" });
-  }
-  
-  /** Test that bogus arguments result in exception */
-  public void testBogusArguments() throws Exception {
-    try {
-      tokenFilterFactory("Hunspell2Stem",
-          "dictionary", "simple.dic",
-          "bogusArg", "bogusValue");
-      fail();
-    } catch (IllegalArgumentException expected) {
-      assertTrue(expected.getMessage().contains("Unknown parameters"));
-    }
-  }
-}

From a51e85f91f3a189bf6b4a7806856d915d31285f3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 20:19:50 +0000
Subject: [PATCH 15/17] fix oops

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572719 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/analysis/hunspell/HunspellStemFilter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
index a9b512b7bbd..87de53aee63 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
@@ -66,7 +66,7 @@ public final class HunspellStemFilter extends TokenFilter {
   }
 
   /**
-   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
    * Dictionary
    *
    * @param input TokenStream whose tokens will be stemmed

From 66ccdead470caa1daba3e9f619fec7ceb87b699a Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 20:39:35 +0000
Subject: [PATCH 16/17] LUCENE-5468: fix precommit+test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572724 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                                          | 6 ++++++
 .../analysis/hunspell/TestHunspellStemFilterFactory.java    | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ebe7bb12a8a..5b9245fcaf1 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -82,6 +82,12 @@ API Changes
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
   that supports random access to the ordinals in a document. (Robert Muir)
 
+Optimizations
+
+* LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads
+  all known OpenOffice dictionaries without error, and supports an additional
+  longestOnly option for a less aggressive approach.  (Robert Muir)
+
 Bug fixes
 
 * LUCENE-5450: Fix getField() NPE issues with SpanOr/SpanNear when they have an 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
index f4302035dbc..b671f6dbdbf 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
@@ -30,7 +30,7 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
   public void testStemming() throws Exception {
     Reader reader = new StringReader("abc");
     TokenStream stream = whitespaceMockTokenizer(reader);
-    stream = tokenFilterFactory("Hunspell2Stem",
+    stream = tokenFilterFactory("HunspellStem",
         "dictionary", "simple.dic",
         "affix", "simple.aff").create(stream);
     assertTokenStreamContents(stream, new String[] { "ab" });
@@ -39,7 +39,7 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
-      tokenFilterFactory("Hunspell2Stem",
+      tokenFilterFactory("HunspellStem",
           "dictionary", "simple.dic",
           "bogusArg", "bogusValue");
       fail();

From 0c5f1c42a8bbc744c519d25fee7481b77d474a49 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 20:42:52 +0000
Subject: [PATCH 17/17] LUCENE-5468: add additional change

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572727 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5b9245fcaf1..e9679b25b05 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -82,6 +82,8 @@ API Changes
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
   that supports random access to the ordinals in a document. (Robert Muir)
 
+* LUCENE-5468: Move offline Sort (from suggest module) to OfflineSort. (Robert Muir)
+
 Optimizations
 
 * LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads