LUCENE-9710: Hunspell: support minor compounding-related flags (#2272)

* LUCENE-9710: Hunspell: support COMPOUNDFLAG
* LUCENE-9710: Hunspell: fix CHECKCOMPOUNDCASE support
* LUCENE-9710: Hunspell: support CHECKCOMPOUNDDUP
* LUCENE-9710: Hunspell: support triple flags (CHECKCOMPOUNDTRIPLE, SIMPLIFIEDTRIPLE)
* LUCENE-9710: Hunspell: support COMPOUNDFORBIDFLAG
* LUCENE-9710: Hunspell: support FORCEUCASE
2021-02-01 10:20:11 +01:00 · 2021-02-01 10:20:11 +01:00 · 9d45dfe776
parent 40e92315ae
commit 9d45dfe776
32 changed files with 247 additions and 21 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -139,11 +139,13 @@ public class Dictionary {
  boolean twoStageAffix;

  char circumfix;
-  char keepcase;
+  char keepcase, forceUCase;
  char needaffix;
  char forbiddenword;
-  char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
-  boolean checkCompoundCase;
+  char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag;
+  char compoundPermit, compoundForbid;
+  boolean checkCompoundCase, checkCompoundDup;
+  boolean checkCompoundTriple, simplifiedTriple;
  int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
  List<CompoundRule> compoundRules; // nullable

@ -350,6 +352,8 @@ public class Dictionary {
        circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("KEEPCASE".equals(firstWord)) {
        keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("FORCEUCASE".equals(firstWord)) {
+        forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
        needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("ONLYINCOMPOUND".equals(firstWord)) {
@ -387,6 +391,8 @@ public class Dictionary {
        compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
      } else if ("COMPOUNDRULE".equals(firstWord)) {
        compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
+      } else if ("COMPOUNDFLAG".equals(firstWord)) {
+        compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDBEGIN".equals(firstWord)) {
        compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIDDLE".equals(firstWord)) {
@ -395,8 +401,16 @@ public class Dictionary {
        compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
        compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) {
+        compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
        checkCompoundCase = true;
+      } else if ("CHECKCOMPOUNDDUP".equals(firstWord)) {
+        checkCompoundDup = true;
+      } else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) {
+        checkCompoundTriple = true;
+      } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
+        simplifiedTriple = true;
      }
    }

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -16,9 +16,16 @@
 */
 package org.apache.lucene.analysis.hunspell;

+import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
+import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
+import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
+import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
+
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;

 /**
@ -126,7 +133,7 @@ public class SpellChecker {
      return false;
    }

-    if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
+    if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
      return true;
    }

@ -135,12 +142,11 @@ public class SpellChecker {
      return true;
    }

-    return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
-  }
+    if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
+      return checkCompounds(wordChars, 0, length, originalCase, 0);
+    }

-  private boolean hasStems(
-      char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
-    return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
+    return false;
  }

  private boolean checkCompounds(
@ -149,12 +155,23 @@ public class SpellChecker {

    int limit = length - dictionary.compoundMin + 1;
    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
-      WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
+      WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
      int breakOffset = offset + breakPos;
-      if (checkCompoundCase(chars, breakOffset)
-          && hasStems(chars, offset, breakPos, originalCase, context)) {
+      if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
+        List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
+        if (stems.isEmpty()
+            && dictionary.simplifiedTriple
+            && chars[breakOffset - 1] == chars[breakOffset]) {
+          stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
+        }
+        if (stems.isEmpty()) continue;
+
        int remainingLength = length - breakPos;
-        if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
+        List<CharsRef> lastStems =
+            stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+        if (!lastStems.isEmpty()
+            && !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
+            && !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
          return true;
        }

@ -167,9 +184,37 @@ public class SpellChecker {
    return false;
  }

-  private boolean checkCompoundCase(char[] chars, int breakPos) {
-    if (!dictionary.checkCompoundCase) return true;
-    return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
+  private boolean hasForceUCaseProblem(
+      char[] chars, int offset, int length, WordCase originalCase) {
+    if (dictionary.forceUCase == FLAG_UNSET) return false;
+    if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
+
+    IntsRef forms = dictionary.lookupWord(chars, offset, length);
+    return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase, scratch);
+  }
+
+  private boolean intersectIgnoreCase(List<CharsRef> stems1, List<CharsRef> stems2) {
+    return stems1.stream().anyMatch(s1 -> stems2.stream().anyMatch(s2 -> equalsIgnoreCase(s1, s2)));
+  }
+
+  private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
+    return cr1.toString().equalsIgnoreCase(cr2.toString());
+  }
+
+  private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
+    if (dictionary.checkCompoundCase) {
+      if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) {
+        return false;
+      }
+    }
+    if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
+      //noinspection RedundantIfStatement
+      if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
+          || breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) {
+        return false;
+      }
+    }
+    return true;
  }

  private boolean checkCompoundRules(
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -238,9 +238,15 @@ final class Stemmer {
        if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
          continue;
        }
-        if (context.isCompound()
-            && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
-          continue;
+        if (context.isCompound()) {
+          if (context != WordContext.COMPOUND_END
+              && Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
+            return new ArrayList<>();
+          }
+          if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
+              && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
+            continue;
+          }
        }
        stems.add(newStem(word, offset, length, forms, i));
      }
@ -554,7 +560,10 @@ final class Stemmer {
      WordContext context) {
    int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);

-    if (context.isCompound() && dictionary.compoundPermit > 0) {
+    if (context.isCompound()) {
+      if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid, scratch)) {
+        return false;
+      }
      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
        return false;
@ -672,7 +681,10 @@ final class Stemmer {
          }
          if (context.isCompound()) {
            char cFlag = context.requiredFlag(dictionary);
-            if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
+            if (!Dictionary.hasFlag(wordFlags, cFlag)
+                && !isFlagAppendedByAffix(affix, cFlag)
+                && !Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
+                && !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
              continue;
            }
          }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("allcaps");
  }

+  @Test
+  public void forceUCase() throws Exception {
+    doTest("forceucase");
+  }
+
  @Test
  public void checkSharpS() throws Exception {
    doTest("checksharps");
@ -71,6 +76,36 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("needaffix5");
  }

+  @Test
+  public void compoundFlag() throws Exception {
+    doTest("compoundflag");
+  }
+
+  @Test
+  public void checkCompoundCase() throws Exception {
+    doTest("checkcompoundcase");
+  }
+
+  @Test
+  public void checkCompoundDup() throws Exception {
+    doTest("checkcompounddup");
+  }
+
+  @Test
+  public void checkCompoundTriple() throws Exception {
+    doTest("checkcompoundtriple");
+  }
+
+  @Test
+  public void simplifiedTriple() throws Exception {
+    doTest("simplifiedtriple");
+  }
+
+  @Test
+  public void compoundForbid() throws Exception {
+    doTest("compoundforbid");
+  }
+
  public void testBreak() throws Exception {
    doTest("break");
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.aff
@ -0,0 +1,3 @@
+# forbid upper case letters at word bounds in compounding
+CHECKCOMPOUNDCASE
+COMPOUNDFLAG A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.dic
@ -0,0 +1,5 @@
+4
+foo/A
+Bar/A
+BAZ/A
+-/A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.good
@ -0,0 +1,5 @@
+Barfoo
+foo-Bar
+foo-BAZ
+BAZ-foo
+BAZ-Bar
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundcase.wrong
@ -0,0 +1,3 @@
+fooBar
+BAZBar
+BAZfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.aff
@ -0,0 +1,3 @@
+# Forbid word duplication in compounds (e.g. foofoo)
+CHECKCOMPOUNDDUP
+COMPOUNDFLAG A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.dic
@ -0,0 +1,3 @@
+2
+foo/A
+bar/A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.good
@ -0,0 +1,5 @@
+barfoo
+foobar
+foofoobar
+foobarfoo
+barfoobarfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompounddup.wrong
@ -0,0 +1,3 @@
+foofoo
+foofoofoo
+foobarbar
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.aff
@ -0,0 +1,3 @@
+# Forbid compound word with triple letters
+CHECKCOMPOUNDTRIPLE
+COMPOUNDFLAG A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.dic
@ -0,0 +1,5 @@
+4
+foo/A
+opera/A
+eel/A
+bare/A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.good
@ -0,0 +1,6 @@
+operafoo
+operaeel
+operabare
+eelbare
+eelfoo
+eelopera
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundtriple.wrong
@ -0,0 +1,2 @@
+fooopera
+bareeel
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.aff
@ -0,0 +1,3 @@
+COMPOUNDMIN 3
+COMPOUNDFLAG A
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.dic
@ -0,0 +1,5 @@
+4
+foo/A
+bar/A
+xy/A
+yz/A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.good
@ -0,0 +1,3 @@
+foobar
+barfoo
+foobarfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundflag.wrong
@ -0,0 +1,4 @@
+xyyz
+fooxy
+xyfoo
+fooxybar
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.aff
@ -0,0 +1,15 @@
+# Dictionary words with COMPOUNDFORBIDFLAG are
+# removed from the beginning and middle of
+# compound words, overriding the effect of
+# COMPOUNDPERMITFLAG.
+#
+# See compoundaffix3 test for basic usage
+# of COMPOUNDFORBIDFLAG.
+
+COMPOUNDFLAG X
+COMPOUNDPERMITFLAG Y
+COMPOUNDFORBIDFLAG Z
+
+SFX S Y 2
+SFX S   0     bar/YX         .
+SFX S   0     baz/YX         .
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.dic
@ -0,0 +1,4 @@
+3
+foo/S
+example/X
+foobaz/Z
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.good
@ -0,0 +1,5 @@
+foo
+example
+foobar
+foobaz
+foobarexample
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundforbid.wrong
@ -0,0 +1,3 @@
+fooexample
+examplefoo
+foobazexample
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.aff
@ -0,0 +1,4 @@
+# force capitalized compound
+TRY F
+FORCEUCASE A
+COMPOUNDFLAG C
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.dic
@ -0,0 +1,4 @@
+3
+foo/C
+bar/C
+baz/CA
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.good
@ -0,0 +1,7 @@
+foo
+bar
+baz
+foobar
+Foobaz
+foobazbar
+Foobarbaz
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forceucase.wrong
@ -0,0 +1,2 @@
+foobaz
+foobarbaz
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.aff
@ -0,0 +1,8 @@
+# Forbid compound word with triple letters
+CHECKCOMPOUNDTRIPLE
+# Allow simplified forms
+SIMPLIFIEDTRIPLE
+
+COMPOUNDMIN 2
+
+COMPOUNDFLAG A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.dic
@ -0,0 +1,3 @@
+2
+glass/A
+sko/A
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.good
@ -0,0 +1,3 @@
+glass
+sko
+glassko
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simplifiedtriple.wrong
@ -0,0 +1 @@
+glasssko