LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)

2021-01-22 12:01:53 +01:00 · 2021-01-22 12:01:53 +01:00 · d7968130c3
parent cf5db8d651
commit d7968130c3
40 changed files with 730 additions and 29 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -86,8 +86,8 @@ API Changes
 Improvements
-* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
+* LUCENE-9665 LUCENE-9676 LUCENE-9667 LUCENE-9684: Hunspell improvements: add SpellChecker API, support default encoding and
-  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
+  BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
@ -0,0 +1,105 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.util.List;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 /**
 * One parsed COMPOUNDRULE pattern. The pattern is kept as a flat char array in which every flag
 * char may be directly followed by a {@code '*'} or {@code '?'} quantifier char.
 */
 class CompoundRule {
  /** Parsed flags interleaved with their optional quantifier chars. */
  private final char[] data;

  /** Dictionary used to parse flags and to test whether word forms carry a flag. */
  private final Dictionary dictionary;

  /**
   * Parses a rule string. Flags outside parentheses are parsed as one run; each parenthesized
   * group is parsed separately and may be followed by a {@code '*'} or {@code '?'} quantifier.
   *
   * @param rule the raw rule text
   * @param dictionary the dictionary whose flag parsing strategy is used
   * @throws IllegalArgumentException if an opening parenthesis has no closing counterpart
   */
  CompoundRule(String rule, Dictionary dictionary) {
    this.dictionary = dictionary;
    final int end = rule.length();
    StringBuilder flags = new StringBuilder();
    int cursor = 0;
    while (cursor < end) {
      int open = rule.indexOf('(', cursor);
      if (open == -1) {
        // No further groups: everything that is left is a plain flag run.
        flags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(cursor)));
        break;
      }
      // Plain flags (possibly none) located before the group.
      flags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(cursor, open)));
      int close = rule.indexOf(')', open + 1);
      if (close == -1) {
        throw new IllegalArgumentException("Unmatched parentheses: " + rule);
      }
      flags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(open + 1, close)));
      cursor = close + 1;
      if (cursor < end) {
        char quantifier = rule.charAt(cursor);
        if (quantifier == '?' || quantifier == '*') {
          flags.append(quantifier);
          cursor++;
        }
      }
    }
    data = flags.toString().toCharArray();
  }

  /**
   * Lenient check: also succeeds when the words run out before the pattern does, i.e. when the
   * given words may be the beginning of a matching sequence.
   */
  boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
    return match(words, 0, 0, scratch, false);
  }

  /** Strict check: the pattern and the word list must be consumed together. */
  boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
    return match(words, 0, 0, scratch, true);
  }

  /**
   * Recursive backtracking matcher.
   *
   * @param words the word forms of the compound parts collected so far
   * @param patternIndex current position in {@link #data}
   * @param wordIndex current position in {@code words}
   * @param scratch reusable buffer passed through to the flag lookup
   * @param fully whether running out of words before the pattern ends counts as a failure
   */
  private boolean match(
      List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
    final int wordCount = words.size();
    if (patternIndex >= data.length) {
      // Pattern exhausted: success only if no words are left over.
      return wordIndex >= wordCount;
    }
    if (!fully && wordIndex >= wordCount) {
      return true;
    }

    final char flag = data[patternIndex];
    final int nextPattern = patternIndex + 1;
    final char quantifier = nextPattern < data.length ? data[nextPattern] : 0;

    if (quantifier == '*') {
      // Greedily consume every consecutive word having the flag, then back off one at a time.
      int consumedEnd = wordIndex;
      while (consumedEnd < wordCount
          && dictionary.hasFlag(words.get(consumedEnd), flag, scratch)) {
        consumedEnd++;
      }
      for (int resumeAt = consumedEnd; resumeAt >= wordIndex; resumeAt--) {
        if (match(words, patternIndex + 2, resumeAt, scratch, fully)) {
          return true;
        }
      }
      return false;
    }

    final boolean currentWordMatches =
        wordIndex < wordCount && dictionary.hasFlag(words.get(wordIndex), flag, scratch);

    if (quantifier == '?') {
      // Try using the optional element first, then try skipping it.
      return (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully))
          || match(words, patternIndex + 2, wordIndex, scratch, fully);
    }

    return currentWordMatches && match(words, nextPattern, wordIndex + 1, scratch, fully);
  }

  @Override
  public String toString() {
    return String.valueOf(data);
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -92,6 +92,8 @@ public class Dictionary {
  private static final String LANG_KEY = "LANG";
  private static final String BREAK_KEY = "BREAK";
  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
  private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
  private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@ -136,7 +138,7 @@ public class Dictionary {
  static final int AFFIX_APPEND = 3;
  // Default flag parsing strategy
-  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
+  FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
  // AF entries
  private String[] aliases;
@ -163,6 +165,8 @@ public class Dictionary {
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
  int compoundMin = 3;
  List<CompoundRule> compoundRules; // nullable
  // ignored characters (dictionary, affix, inputs)
  private char[] ignore;
@ -419,6 +423,18 @@ public class Dictionary {
          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
        }
        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(COMPOUNDMIN_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
        }
        compoundMin = Math.max(1, Integer.parseInt(parts[1]));
      } else if (line.startsWith(COMPOUNDRULE_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
        }
        this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
      }
    }
@ -442,6 +458,21 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }
  /**
   * Reads the {@code num} rule lines that follow a COMPOUNDRULE header.
   *
   * @param reader the affix file reader, positioned right after the header line
   * @param num the number of rule lines announced by the header
   * @return the parsed rules, in file order
   * @throws IOException if reading from {@code reader} fails
   * @throws ParseException if the input ends before {@code num} rules were read, or a line is not
   *     of the form {@code COMPOUNDRULE <rule>}
   */
  private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
      throws IOException, ParseException {
    List<CompoundRule> compoundRules = new ArrayList<>();
    for (int i = 0; i < num; i++) {
      String line = reader.readLine();
      if (line == null) {
        // readLine() signals end of stream with null; report it as a parse problem instead of
        // failing with a NullPointerException below.
        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
      }
      String[] parts = line.split("\\s+");
      if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
      }
      compoundRules.add(new CompoundRule(parts[1], this));
    }
    return compoundRules;
  }
  private Breaks parseBreaks(LineNumberReader reader, String line)
      throws IOException, ParseException {
    Set<String> starting = new LinkedHashSet<>();
@ -910,7 +941,7 @@ public class Dictionary {
      reuse.append(caseFold(word.charAt(i)));
    }
    reuse.append(FLAG_SEPARATOR);
-    reuse.append(HIDDEN_FLAG);
+    flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
  }
@ -1188,16 +1219,19 @@ public class Dictionary {
    return null;
  }
-  boolean isForbiddenWord(char[] word, BytesRef scratch) {
+  boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
    if (forbiddenword != -1) {
-      IntsRef forms = lookupWord(word, 0, word.length);
+      IntsRef forms = lookupWord(word, 0, length);
-      if (forms != null) {
+      return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
-        int formStep = formStep();
+    }
-        for (int i = 0; i < forms.length; i += formStep) {
+    return false;
-          if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
+  }
-            return true;
+
-          }
+  boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
-        }
+    int formStep = formStep();
    for (int i = 0; i < forms.length; i += formStep) {
      if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
        return true;
      }
    }
    return false;
@ -1227,6 +1261,8 @@ public class Dictionary {
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
    abstract void appendFlag(char flag, StringBuilder to);
  }
  /**
@ -1238,6 +1274,11 @@ public class Dictionary {
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
    /** Appends the flag to {@code to} as a single character. */
    @Override
    void appendFlag(char flag, StringBuilder to) {
      to.append(flag);
    }
  }
  /**
@ -1266,6 +1307,14 @@ public class Dictionary {
      }
      return flags;
    }
    /**
     * Appends the flag's numeric value in decimal form, preceded by a comma when {@code to}
     * already has content.
     */
    @Override
    void appendFlag(char flag, StringBuilder to) {
      boolean needsSeparator = to.length() != 0;
      if (needsSeparator) {
        to.append(',');
      }
      int numericFlag = flag;
      to.append(numericFlag);
    }
  }
  /**
@ -1300,6 +1349,16 @@ public class Dictionary {
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
    /** Appends the flag as two chars: its upper 8 bits first, then its lower 8 bits. */
    @Override
    void appendFlag(char flag, StringBuilder to) {
      char high = (char) (flag >> 8);
      char low = (char) (flag & 0xff);
      to.append(high).append(low);
    }
  }
  /** Returns whether compound rules are present, i.e. {@code compoundRules} is non-null. */
  boolean hasCompounding() {
    return compoundRules != null;
  }
  boolean hasFlag(int entryId, char flag, BytesRef scratch) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -16,7 +16,10 @@
 */
 package org.apache.lucene.analysis.hunspell;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 /**
 * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@ -37,26 +40,100 @@ public class SpellChecker {
  public boolean spell(String word) {
    if (word.isEmpty()) return true;
-    char[] wordChars = word.toCharArray();
+    if (dictionary.needsInputCleaning) {
-    if (dictionary.isForbiddenWord(wordChars, scratch)) {
+      word = dictionary.cleanInput(word, new StringBuilder()).toString();
      return false;
    }
    if (isNumber(word)) {
      return true;
    }
-    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+    char[] wordChars = word.toCharArray();
    if (checkWord(wordChars, wordChars.length, false)) {
      return true;
    }
-    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
    if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
      return true;
    }
    if (dictionary.breaks.isNotEmpty()
        && !hasTooManyBreakOccurrences(word)
        && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
      return tryBreaks(word);
    }
    return false;
  }
  /**
   * Checks the case-folded variants of a word. For {@link WordCase#UPPER} the title-folded
   * variant is tried first; after that the result of {@code caseFoldLower} applied to the current
   * variant is tried.
   *
   * @param wordChars the word as typed
   * @param wordCase the detected case of {@code wordChars}
   * @return whether any tried variant is accepted by {@code checkWord}
   */
  private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
    final int length = wordChars.length;
    char[] candidate = wordChars;
    if (wordCase == WordCase.UPPER) {
      candidate = stemmer.caseFoldTitle(wordChars, length);
      if (checkWord(candidate, length, true)) {
        return true;
      }
    }
    char[] lowered = stemmer.caseFoldLower(candidate, length);
    return checkWord(lowered, length, true);
  }
  /**
   * Checks one concrete spelling: forbidden words are rejected, words that have at least one stem
   * are accepted, and otherwise the word is tried as a compound when the dictionary has
   * compounding.
   *
   * @param wordChars buffer holding the word
   * @param length number of chars of {@code wordChars} to consider
   * @param caseVariant passed through to the stemmer
   */
  private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
      return false;
    }

    boolean hasStems = !stemmer.doStem(wordChars, length, caseVariant).isEmpty();
    if (hasStems) {
      return true;
    }

    return dictionary.hasCompounding()
        && checkCompounds(wordChars, 0, length, new ArrayList<>());
  }
  /**
   * Recursively splits {@code wordChars[offset, offset + length)} into dictionary words that
   * together satisfy one of the dictionary's compound rules.
   *
   * @param wordChars buffer holding the whole word
   * @param offset start of the part that still has to be split
   * @param length number of chars that still have to be split
   * @param words word forms of the parts found so far; grows and shrinks during backtracking
   * @return whether some split of the remaining chars completes a compound rule
   */
  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
    // Hard cap on the number of compound parts.
    if (words.size() >= 100) {
      return false;
    }

    final int minPart = dictionary.compoundMin;
    for (int breakPos = minPart; breakPos <= length - minPart; breakPos++) {
      IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
      if (forms == null) {
        continue;
      }

      words.add(forms);

      // Only go deeper when the parts so far can still be the start of some rule.
      boolean prefixAllowed = false;
      if (dictionary.compoundRules != null) {
        for (CompoundRule rule : dictionary.compoundRules) {
          if (rule.mayMatch(words, scratch)) {
            prefixAllowed = true;
            break;
          }
        }
      }

      if (prefixAllowed) {
        int restStart = offset + breakPos;
        int restLength = length - breakPos;
        if (checkLastCompoundPart(wordChars, restStart, restLength, words)
            || checkCompounds(wordChars, restStart, restLength, words)) {
          return true;
        }
      }

      // Backtrack: drop this part before trying the next break position.
      words.remove(words.size() - 1);
    }

    return false;
  }
  /**
   * Treats {@code wordChars[start, start + length)} as the final compound part: it must be a
   * dictionary word, and the parts collected so far plus this one must fully match a compound
   * rule. {@code words} is left unchanged on return.
   */
  private boolean checkLastCompoundPart(
      char[] wordChars, int start, int length, List<IntsRef> words) {
    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
    if (forms == null) {
      return false;
    }

    words.add(forms);
    boolean matched = false;
    if (dictionary.compoundRules != null) {
      for (CompoundRule rule : dictionary.compoundRules) {
        if (rule.fullyMatches(words, scratch)) {
          matched = true;
          break;
        }
      }
    }
    // Undo the temporary addition so the caller's list is as it was.
    words.remove(words.size() - 1);
    return matched;
  }
  private static boolean isNumber(String s) {
    int i = 0;
    while (i < s.length()) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -112,8 +112,8 @@ final class Stemmer {
  private char[] titleBuffer = new char[8];
  /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private WordCase caseOf(char[] word, int length) {
+  WordCase caseOf(char[] word, int length) {
-    if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
+    if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
      return WordCase.MIXED;
    }
@ -121,22 +121,24 @@ final class Stemmer {
  }
  /** folds titlecase variant of word to titleBuffer */
-  private void caseFoldTitle(char[] word, int length) {
+  char[] caseFoldTitle(char[] word, int length) {
    titleBuffer = ArrayUtil.grow(titleBuffer, length);
    System.arraycopy(word, 0, titleBuffer, 0, length);
    for (int i = 1; i < length; i++) {
      titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
    }
    return titleBuffer;
  }
  /** folds lowercase variant of word (title cased) to lowerBuffer */
-  private void caseFoldLower(char[] word, int length) {
+  char[] caseFoldLower(char[] word, int length) {
    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
    System.arraycopy(word, 0, lowerBuffer, 0, length);
    lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
    return lowerBuffer;
  }
-  private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+  List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
    List<CharsRef> stems = new ArrayList<>();
    IntsRef forms = dictionary.lookupWord(word, 0, length);
    if (forms != null) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@ -23,7 +23,7 @@ enum WordCase {
  MIXED;
  static WordCase caseOf(char[] word, int length) {
-    boolean capitalized = Character.isUpperCase(word[0]);
+    boolean startsWithLower = Character.isLowerCase(word[0]);
    boolean seenUpper = false;
    boolean seenLower = false;
@ -34,11 +34,11 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }
-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
  }
  static WordCase caseOf(CharSequence word, int length) {
-    boolean capitalized = Character.isUpperCase(word.charAt(0));
+    boolean startsWithLower = Character.isLowerCase(word.charAt(0));
    boolean seenUpper = false;
    boolean seenLower = false;
@ -49,11 +49,11 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }
-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
  }
-  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
+  private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
-    if (capitalized) {
+    if (!startsWithLower) {
      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
    }
    return seenUpper ? MIXED : LOWER;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("breakoff");
  }
  // Fixture rule: ABC (plain sequence of three flags).
  public void testCompoundrule() throws Exception {
    doTest("compoundrule");
  }
  // Fixture rule: A*B*C* (every element repeatable).
  public void testCompoundrule2() throws Exception {
    doTest("compoundrule2");
  }
  // Fixture rule: A?B?C? (every element optional).
  public void testCompoundrule3() throws Exception {
    doTest("compoundrule3");
  }
  // Fixture: English ordinal numbers, rules n*1t and n*mp.
  public void testCompoundrule4() throws Exception {
    doTest("compoundrule4");
  }
  // Fixture: number + percent, rules N*%? and NN*.NN*%? (UTF-8).
  public void testCompoundrule5() throws Exception {
    doTest("compoundrule5");
  }
  // Fixture rules: A*A and A*AAB*BBBC*C.
  public void testCompoundrule6() throws Exception {
    doTest("compoundrule6");
  }
  // Fixture: English ordinal numbers with parenthesized long flags (FLAG long).
  public void testCompoundrule7() throws Exception {
    doTest("compoundrule7");
  }
  // Fixture: English ordinal numbers with parenthesized numerical flags (FLAG num).
  public void testCompoundrule8() throws Exception {
    doTest("compoundrule8");
  }
  protected void doTest(String name) throws Exception {
    InputStream affixStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.util.Random;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 import org.junit.Test;
 public class TestDictionary extends LuceneTestCase {
@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG    UTF-8"));
  }
  /**
   * Round-trips a random set of flags through {@code appendFlag} and {@code parseFlags} for the
   * "long", "UTF-8" and "num" flag strategies and expects the original flags back.
   */
  @Test
  public void testFlagSerialization() {
    Random rnd = random();
    int flagCount = rnd.nextInt(10);
    char[] flags = new char[flagCount];
    for (int i = 0; i < flagCount; i++) {
      flags[i] = (char) rnd.nextInt(Character.MAX_VALUE);
    }
    String expected = new String(flags);

    for (String flagLine : new String[] {"FLAG long", "FLAG UTF-8", "FLAG num"}) {
      Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);

      StringBuilder serialized = new StringBuilder();
      for (char flag : flags) {
        strategy.appendFlag(flag, serialized);
      }

      String actual = new String(strategy.parseFlags(serialized.toString()));
      assertEquals(expected, actual);
    }
  }
  private Directory getDirectory() {
    return newDirectory();
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
@ -0,0 +1,3 @@
 COMPOUNDMIN 1
 COMPOUNDRULE 1
 COMPOUNDRULE ABC
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
@ -0,0 +1,5 @@
 3
 a/A
 b/B
 c/BC
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
@ -0,0 +1,2 @@
 abc
 acc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
@ -0,0 +1,39 @@
 ba
 aaabaaa
 bbaaa
 aaaaba
 bbbbbaa
 aa
 aaa
 aaaa
 ab
 aab
 aaab
 aaaab
 abb
 aabb
 aaabbb
 bb
 bbb
 bbbb
 aaab
 abcc
 abbc
 abbcc
 aabc
 aabcc
 aabbc
 aabbcc
 aaabbbccc
 ac
 aac
 aacc
 aaaccc
 bc
 bcc
 bbc
 bbcc
 bbbccc
 cc
 ccc
 cccccc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
@ -0,0 +1,3 @@
 COMPOUNDMIN 1
 COMPOUNDRULE 1
 COMPOUNDRULE A*B*C*
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
@ -0,0 +1,5 @@
 3
 a/A
 b/B
 c/C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
@ -0,0 +1,37 @@
 aa
 aaa
 aaaa
 ab
 aab
 aaab
 aaaab
 abb
 aabb
 aaabbb
 bb
 bbb
 bbbb
 aaab
 abc
 abcc
 abbc
 abbcc
 aabc
 aabcc
 aabbc
 aabbcc
 aaabbbccc
 ac
 acc
 aac
 aacc
 aaaccc
 bc
 bcc
 bbc
 bbcc
 bbbccc
 cc
 ccc
 cccccc
 abcc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
@ -0,0 +1,8 @@
 ba
 aaabaaa
 bbaaa
 aaaaba
 bbbbbaa
 cba
 cab
 acb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
@ -0,0 +1,3 @@
 COMPOUNDMIN 1
 COMPOUNDRULE 1
 COMPOUNDRULE A?B?C?
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
@ -0,0 +1,5 @@
 3
 a/A
 b/B
 c/C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
@ -0,0 +1,7 @@
 a
 b
 c
 ab
 abc
 ac
 bc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
@ -0,0 +1,41 @@
 aa
 aaa
 aaaa
 aab
 aaab
 aaaab
 abb
 aabb
 aaabbb
 bb
 bbb
 bbbb
 aaab
 abcc
 abbc
 abbcc
 aabc
 aabcc
 aabbc
 aabbcc
 aaabbbccc
 acc
 aac
 aacc
 aaaccc
 bcc
 bbc
 bbcc
 bbbccc
 cc
 ccc
 cccccc
 abcc
 ba
 aaabaaa
 bbaaa
 aaaaba
 bbbbbaa
 cba
 cab
 acb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@ -0,0 +1,7 @@
 # English ordinal numbers
 WORDCHARS 0123456789
 COMPOUNDMIN 1
 ONLYINCOMPOUND c
 COMPOUNDRULE 2
 COMPOUNDRULE n*1t
 COMPOUNDRULE n*mp
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@ -0,0 +1,24 @@
 22
 0/nm
 1/n1
 2/nm
 3/nm
 4/nm
 5/nm
 6/nm
 7/nm
 8/nm
 9/nm
 0th/pt
 1st/p
 1th/tc
 2nd/p
 2th/tc
 3rd/p
 3th/tc
 4th/pt
 5th/pt
 6th/pt
 7th/pt
 8th/pt
 9th/pt
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@ -0,0 +1,31 @@
 1st
 2nd
 3rd
 4th
 5th
 6th
 7th
 8th
 9th
 10th
 11th
 12th
 13th
 14th
 15th
 16th
 17th
 18th
 19th
 20th
 21st
 22nd
 23rd
 24th
 25th
 100th
 1000th
 10001st
 10011th
 1ST
 42ND
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
@ -0,0 +1,5 @@
 1th
 2th
 3th
 10001th
 10011st
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
@ -0,0 +1,7 @@
 # number + percent
 SET UTF-8
 COMPOUNDMIN 1
 COMPOUNDRULE 2
 COMPOUNDRULE N*%?
 COMPOUNDRULE NN*.NN*%?
 WORDCHARS 0123456789‰.
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
@ -0,0 +1,14 @@
 13
 0/N	po:num
 1/N	po:num
 2/N	po:num
 3/N	po:num
 4/N	po:num
 5/N	po:num
 6/N	po:num
 7/N	po:num
 8/N	po:num
 9/N	po:num
 ./.	po:sign_dot
 %/%	po:sign_percent
 ‰/%	po:sign_per_mille
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
@ -0,0 +1,7 @@
 10%
 0.2%
 0.20%
 123.4561‰
 10
 0000
 10.25
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
@ -0,0 +1 @@
 .25
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
@ -0,0 +1,4 @@
 COMPOUNDMIN 1
 COMPOUNDRULE 2
 COMPOUNDRULE A*A
 COMPOUNDRULE A*AAB*BBBC*C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
@ -0,0 +1,5 @@
 3
 a/A
 b/B
 c/C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
@ -0,0 +1,4 @@
 aa
 aaaaaa
 aabbbc
 aaaaabbbbbbcccccc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
@ -0,0 +1,4 @@
 abc
 abbbbbccccccc
 aabbccccccc
 aabbbbbbb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
@ -0,0 +1,8 @@
 # English ordinal numbers (parenthesized long flags)
 FLAG long
 WORDCHARS 0123456789
 COMPOUNDMIN 1
 ONLYINCOMPOUND cc
 COMPOUNDRULE 2
 COMPOUNDRULE (nn)*(11)(tt)
 COMPOUNDRULE (nn)*(mm)(pp)
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
@ -0,0 +1,24 @@
 22
 0/nnmm
 1/nn11
 2/nnmm
 3/nnmm
 4/nnmm
 5/nnmm
 6/nnmm
 7/nnmm
 8/nnmm
 9/nnmm
 0th/pptt
 1st/pp
 1th/ttcc
 2nd/pp
 2th/ttcc
 3rd/pp
 3th/ttcc
 4th/pptt
 5th/pptt
 6th/pptt
 7th/pptt
 8th/pptt
 9th/pptt
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
@ -0,0 +1,29 @@
 1st
 2nd
 3rd
 4th
 5th
 6th
 7th
 8th
 9th
 10th
 11th
 12th
 13th
 14th
 15th
 16th
 17th
 18th
 19th
 20th
 21st
 22nd
 23rd
 24th
 25th
 100th
 1000th
 10001st
 10011th
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
@ -0,0 +1,5 @@
 1th
 2th
 3th
 10001th
 10011st
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
@ -0,0 +1,8 @@
 # English ordinal numbers (parenthesized numerical flags)
 FLAG num
 WORDCHARS 0123456789
 COMPOUNDMIN 1
 ONLYINCOMPOUND 1000
 COMPOUNDRULE 2
 COMPOUNDRULE (1001)*(1002)(2001)
 COMPOUNDRULE (1001)*(2002)(2000)
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
@ -0,0 +1,24 @@
 22
 0/1001,2002
 1/1001,1002
 2/1001,2002
 3/1001,2002
 4/1001,2002
 5/1001,2002
 6/1001,2002
 7/1001,2002
 8/1001,2002
 9/1001,2002
 0th/2000,2001
 1st/2000
 1th/2001,1000
 2nd/2000
 2th/2001,1000
 3rd/2000
 3th/2001,1000
 4th/2000,2001
 5th/2000,2001
 6th/2000,2001
 7th/2000,2001
 8th/2000,2001
 9th/2000,2001
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
@ -0,0 +1,29 @@
 1st
 2nd
 3rd
 4th
 5th
 6th
 7th
 8th
 9th
 10th
 11th
 12th
 13th
 14th
 15th
 16th
 17th
 18th
 19th
 20th
 21st
 22nd
 23rd
 24th
 25th
 100th
 1000th
 10001st
 10011th
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
@ -0,0 +1,5 @@
 1th
 2th
 3th
 10001th
 10011st
+/nm
+/n1
+/nm
+/nm
+/nm
+/nm
+/nm
+/nm
+/nm
+/nm
+th/pt
+st/p
+th/tc
+nd/p
+th/tc
+rd/p
+th/tc
+th/pt
+th/pt
+th/pt
+th/pt
+th/pt
+th/pt
+st
+nd
+rd
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+th
+st
+nd
+rd
+th
+th
+th
+th
+st
+th
+ST
+ND
+/nnmm
+/nn11
+/nnmm
+/nnmm
+/nnmm
+/nnmm
+/nnmm
+/nnmm
+/nnmm
+/nnmm
+th/pptt
+st/pp
+th/ttcc
+nd/pp
+th/ttcc
+rd/pp
+th/ttcc
+th/pptt
+th/pptt
+th/pptt
+th/pptt
+th/pptt
+th/pptt