LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)

2021-01-22 12:01:53 +01:00 · 2021-01-22 12:01:53 +01:00 · d7968130c3
parent cf5db8d651
commit d7968130c3
40 changed files with 730 additions and 29 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -86,8 +86,8 @@ API Changes

 Improvements

-* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
-  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
+* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
+  BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)

 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.List;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+class CompoundRule {
+  private final char[] data;
+  private final Dictionary dictionary;
+
+  CompoundRule(String rule, Dictionary dictionary) {
+    this.dictionary = dictionary;
+    StringBuilder parsedFlags = new StringBuilder();
+    int pos = 0;
+    while (pos < rule.length()) {
+      int lParen = rule.indexOf("(", pos);
+      if (lParen < 0) {
+        parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
+        break;
+      }
+
+      parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
+      int rParen = rule.indexOf(')', lParen + 1);
+      if (rParen < 0) {
+        throw new IllegalArgumentException("Unmatched parentheses: " + rule);
+      }
+
+      parsedFlags.append(
+          dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
+      pos = rParen + 1;
+      if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
+        parsedFlags.append(rule.charAt(pos++));
+      }
+    }
+    data = parsedFlags.toString().toCharArray();
+  }
+
+  boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, false);
+  }
+
+  boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, true);
+  }
+
+  private boolean match(
+      List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
+    if (patternIndex >= data.length) {
+      return wordIndex >= words.size();
+    }
+    if (wordIndex >= words.size() && !fully) {
+      return true;
+    }
+
+    char flag = data[patternIndex];
+    if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
+      int startWI = wordIndex;
+      while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
+        wordIndex++;
+      }
+
+      while (wordIndex >= startWI) {
+        if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
+          return true;
+        }
+
+        wordIndex--;
+      }
+      return false;
+    }
+
+    boolean currentWordMatches =
+        wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
+
+    if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
+      if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
+        return true;
+      }
+      return match(words, patternIndex + 2, wordIndex, scratch, fully);
+    }
+
+    return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
+  }
+
+  @Override
+  public String toString() {
+    return new String(data);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -92,6 +92,8 @@ public class Dictionary {
  private static final String LANG_KEY = "LANG";
  private static final String BREAK_KEY = "BREAK";
  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
+  private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
+  private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@ -136,7 +138,7 @@ public class Dictionary {
  static final int AFFIX_APPEND = 3;

  // Default flag parsing strategy
-  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
+  FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();

  // AF entries
  private String[] aliases;
@ -163,6 +165,8 @@ public class Dictionary {
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
+  int compoundMin = 3;
+  List<CompoundRule> compoundRules; // nullable

  // ignored characters (dictionary, affix, inputs)
  private char[] ignore;
@ -419,6 +423,18 @@ public class Dictionary {
          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
        }
        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
+      } else if (line.startsWith(COMPOUNDMIN_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
+        }
+        compoundMin = Math.max(1, Integer.parseInt(parts[1]));
+      } else if (line.startsWith(COMPOUNDRULE_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
+        }
+        this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
      }
    }

@ -442,6 +458,21 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }

+  private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
+      throws IOException, ParseException {
+    String line;
+    List<CompoundRule> compoundRules = new ArrayList<>();
+    for (int i = 0; i < num; i++) {
+      line = reader.readLine();
+      String[] parts = line.split("\\s+");
+      if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
+        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
+      }
+      compoundRules.add(new CompoundRule(parts[1], this));
+    }
+    return compoundRules;
+  }
+
  private Breaks parseBreaks(LineNumberReader reader, String line)
      throws IOException, ParseException {
    Set<String> starting = new LinkedHashSet<>();
@ -910,7 +941,7 @@ public class Dictionary {
      reuse.append(caseFold(word.charAt(i)));
    }
    reuse.append(FLAG_SEPARATOR);
-    reuse.append(HIDDEN_FLAG);
+    flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
  }
@ -1188,16 +1219,19 @@ public class Dictionary {
    return null;
  }

-  boolean isForbiddenWord(char[] word, BytesRef scratch) {
+  boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
    if (forbiddenword != -1) {
-      IntsRef forms = lookupWord(word, 0, word.length);
-      if (forms != null) {
-        int formStep = formStep();
-        for (int i = 0; i < forms.length; i += formStep) {
-          if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
-            return true;
-          }
-        }
+      IntsRef forms = lookupWord(word, 0, length);
+      return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
+    }
+    return false;
+  }
+
+  boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
+    int formStep = formStep();
+    for (int i = 0; i < forms.length; i += formStep) {
+      if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
+        return true;
      }
    }
    return false;
@ -1227,6 +1261,8 @@ public class Dictionary {
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
+
+    abstract void appendFlag(char flag, StringBuilder to);
  }

  /**
@ -1238,6 +1274,11 @@ public class Dictionary {
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      to.append(flag);
+    }
  }

  /**
@ -1266,6 +1307,14 @@ public class Dictionary {
      }
      return flags;
    }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      if (to.length() > 0) {
+        to.append(",");
+      }
+      to.append((int) flag);
+    }
  }

  /**
@ -1300,6 +1349,16 @@ public class Dictionary {
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      to.append((char) (flag >> 8));
+      to.append((char) (flag & 0xff));
+    }
+  }
+
+  boolean hasCompounding() {
+    return compoundRules != null;
  }

  boolean hasFlag(int entryId, char flag, BytesRef scratch) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -16,7 +16,10 @@
 */
 package org.apache.lucene.analysis.hunspell;

+import java.util.ArrayList;
+import java.util.List;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;

 /**
 * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@ -37,26 +40,100 @@ public class SpellChecker {
  public boolean spell(String word) {
    if (word.isEmpty()) return true;

-    char[] wordChars = word.toCharArray();
-    if (dictionary.isForbiddenWord(wordChars, scratch)) {
-      return false;
+    if (dictionary.needsInputCleaning) {
+      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

    if (isNumber(word)) {
      return true;
    }

-    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+    char[] wordChars = word.toCharArray();
+    if (checkWord(wordChars, wordChars.length, false)) {
      return true;
    }

-    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
+    if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
+      return true;
+    }
+
+    if (dictionary.breaks.isNotEmpty()
+        && !hasTooManyBreakOccurrences(word)
+        && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
      return tryBreaks(word);
    }

    return false;
  }

+  private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
+    char[] caseVariant = wordChars;
+    if (wordCase == WordCase.UPPER) {
+      caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
+      if (checkWord(caseVariant, wordChars.length, true)) {
+        return true;
+      }
+    }
+    return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
+  }
+
+  private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
+      return false;
+    }
+
+    if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
+      return true;
+    }
+
+    if (dictionary.hasCompounding()) {
+      return checkCompounds(wordChars, 0, length, new ArrayList<>());
+    }
+
+    return false;
+  }
+
+  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+    if (words.size() >= 100) return false;
+
+    int limit = length - dictionary.compoundMin + 1;
+    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
+      IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
+      if (forms != null) {
+        words.add(forms);
+
+        if (dictionary.compoundRules != null
+            && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
+          if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
+            return true;
+          }
+
+          if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
+            return true;
+          }
+        }
+
+        words.remove(words.size() - 1);
+      }
+    }
+
+    return false;
+  }
+
+  private boolean checkLastCompoundPart(
+      char[] wordChars, int start, int length, List<IntsRef> words) {
+    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
+    if (forms == null) return false;
+
+    words.add(forms);
+    boolean result =
+        dictionary.compoundRules != null
+            && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
+    words.remove(words.size() - 1);
+    return result;
+  }
+
  private static boolean isNumber(String s) {
    int i = 0;
    while (i < s.length()) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -112,8 +112,8 @@ final class Stemmer {
  private char[] titleBuffer = new char[8];

  /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private WordCase caseOf(char[] word, int length) {
-    if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
+  WordCase caseOf(char[] word, int length) {
+    if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
      return WordCase.MIXED;
    }

@ -121,22 +121,24 @@ final class Stemmer {
  }

  /** folds titlecase variant of word to titleBuffer */
-  private void caseFoldTitle(char[] word, int length) {
+  char[] caseFoldTitle(char[] word, int length) {
    titleBuffer = ArrayUtil.grow(titleBuffer, length);
    System.arraycopy(word, 0, titleBuffer, 0, length);
    for (int i = 1; i < length; i++) {
      titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
    }
+    return titleBuffer;
  }

  /** folds lowercase variant of word (title cased) to lowerBuffer */
-  private void caseFoldLower(char[] word, int length) {
+  char[] caseFoldLower(char[] word, int length) {
    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
    System.arraycopy(word, 0, lowerBuffer, 0, length);
    lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
+    return lowerBuffer;
  }

-  private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+  List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
    List<CharsRef> stems = new ArrayList<>();
    IntsRef forms = dictionary.lookupWord(word, 0, length);
    if (forms != null) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@ -23,7 +23,7 @@ enum WordCase {
  MIXED;

  static WordCase caseOf(char[] word, int length) {
-    boolean capitalized = Character.isUpperCase(word[0]);
+    boolean startsWithLower = Character.isLowerCase(word[0]);

    boolean seenUpper = false;
    boolean seenLower = false;
@ -34,11 +34,11 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }

-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
  }

  static WordCase caseOf(CharSequence word, int length) {
-    boolean capitalized = Character.isUpperCase(word.charAt(0));
+    boolean startsWithLower = Character.isLowerCase(word.charAt(0));

    boolean seenUpper = false;
    boolean seenLower = false;
@ -49,11 +49,11 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }

-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
  }

-  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
-    if (capitalized) {
+  private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
+    if (!startsWithLower) {
      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
    }
    return seenUpper ? MIXED : LOWER;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("breakoff");
  }

+  public void testCompoundrule() throws Exception {
+    doTest("compoundrule");
+  }
+
+  public void testCompoundrule2() throws Exception {
+    doTest("compoundrule2");
+  }
+
+  public void testCompoundrule3() throws Exception {
+    doTest("compoundrule3");
+  }
+
+  public void testCompoundrule4() throws Exception {
+    doTest("compoundrule4");
+  }
+
+  public void testCompoundrule5() throws Exception {
+    doTest("compoundrule5");
+  }
+
+  public void testCompoundrule6() throws Exception {
+    doTest("compoundrule6");
+  }
+
+  public void testCompoundrule7() throws Exception {
+    doTest("compoundrule7");
+  }
+
+  public void testCompoundrule8() throws Exception {
+    doTest("compoundrule8");
+  }
+
  protected void doTest(String name) throws Exception {
    InputStream affixStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
+import java.util.Random;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
+import org.junit.Test;

 public class TestDictionary extends LuceneTestCase {

@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
    assertNotNull(Dictionary.getFlagParsingStrategy("FLAG    UTF-8"));
  }

+  @Test
+  public void testFlagSerialization() {
+    Random r = random();
+    char[] flags = new char[r.nextInt(10)];
+    for (int i = 0; i < flags.length; i++) {
+      flags[i] = (char) r.nextInt(Character.MAX_VALUE);
+    }
+
+    String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
+    for (String flagLine : flagLines) {
+      Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);
+      StringBuilder serialized = new StringBuilder();
+      for (char flag : flags) {
+        strategy.appendFlag(flag, serialized);
+      }
+
+      char[] deserialized = strategy.parseFlags(serialized.toString());
+      assertEquals(new String(flags), new String(deserialized));
+    }
+  }
+
  private Directory getDirectory() {
    return newDirectory();
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE ABC
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/BC
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
@ -0,0 +1,2 @@
+abc
+acc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
@ -0,0 +1,39 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A*B*C*
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
@ -0,0 +1,37 @@
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abc
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+acc
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
@ -0,0 +1,8 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A?B?C?
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
@ -0,0 +1,7 @@
+a
+b
+c
+ab
+abc
+ac
+bc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
@ -0,0 +1,41 @@
+aa
+aaa
+aaaa
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+acc
+aac
+aacc
+aaaccc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@ -0,0 +1,7 @@
+# English ordinal numbers
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND c
+COMPOUNDRULE 2
+COMPOUNDRULE n*1t
+COMPOUNDRULE n*mp
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@ -0,0 +1,24 @@
+22
+0/nm
+1/n1
+2/nm
+3/nm
+4/nm
+5/nm
+6/nm
+7/nm
+8/nm
+9/nm
+0th/pt
+1st/p
+1th/tc
+2nd/p
+2th/tc
+3rd/p
+3th/tc
+4th/pt
+5th/pt
+6th/pt
+7th/pt
+8th/pt
+9th/pt
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@ -0,0 +1,31 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
+1ST
+42ND
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
@ -0,0 +1,7 @@
+# number + percent
+SET UTF-8
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE N*%?
+COMPOUNDRULE NN*.NN*%?
+WORDCHARS 0123456789‰.
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
@ -0,0 +1,14 @@
+13
+0/N	po:num
+1/N	po:num
+2/N	po:num
+3/N	po:num
+4/N	po:num
+5/N	po:num
+6/N	po:num
+7/N	po:num
+8/N	po:num
+9/N	po:num
+./.	po:sign_dot
+%/%	po:sign_percent
+‰/%	po:sign_per_mille
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
@ -0,0 +1,7 @@
+10%
+0.2%
+0.20%
+123.4561‰
+10
+0000
+10.25
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
@ -0,0 +1 @@
+.25
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
@ -0,0 +1,4 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE A*A
+COMPOUNDRULE A*AAB*BBBC*C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
@ -0,0 +1,4 @@
+aa
+aaaaaa
+aabbbc
+aaaaabbbbbbcccccc
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
@ -0,0 +1,4 @@
+abc
+abbbbbccccccc
+aabbccccccc
+aabbbbbbb
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized long flags)
+FLAG long
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND cc
+COMPOUNDRULE 2
+COMPOUNDRULE (nn)*(11)(tt)
+COMPOUNDRULE (nn)*(mm)(pp)
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
@ -0,0 +1,24 @@
+22
+0/nnmm
+1/nn11
+2/nnmm
+3/nnmm
+4/nnmm
+5/nnmm
+6/nnmm
+7/nnmm
+8/nnmm
+9/nnmm
+0th/pptt
+1st/pp
+1th/ttcc
+2nd/pp
+2th/ttcc
+3rd/pp
+3th/ttcc
+4th/pptt
+5th/pptt
+6th/pptt
+7th/pptt
+8th/pptt
+9th/pptt
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized numerical flags)
+FLAG num
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND 1000
+COMPOUNDRULE 2
+COMPOUNDRULE (1001)*(1002)(2001)
+COMPOUNDRULE (1001)*(2002)(2000)
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
@ -0,0 +1,24 @@
+22
+0/1001,2002
+1/1001,1002
+2/1001,2002
+3/1001,2002
+4/1001,2002
+5/1001,2002
+6/1001,2002
+7/1001,2002
+8/1001,2002
+9/1001,2002
+0th/2000,2001
+1st/2000
+1th/2001,1000
+2nd/2000
+2th/2001,1000
+3rd/2000
+3th/2001,1000
+4th/2000,2001
+5th/2000,2001
+6th/2000,2001
+7th/2000,2001
+8th/2000,2001
+9th/2000,2001
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st