From a79f641561923d8314519962410bc871d9f79add Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Wed, 3 Feb 2021 17:45:56 +0100
Subject: [PATCH] LUCENE-9720: Hunspell: more ways to vary misspelled word
 variations for suggestions (#2286)

---
 .../lucene/analysis/hunspell/Dictionary.java  |   6 +
 .../analysis/hunspell/ModifyingSuggester.java | 187 +++++++++++++++++-
 .../analysis/hunspell/SpellChecker.java       |   5 +-
 .../lucene/analysis/hunspell/WordCase.java    |   4 +
 .../apache/lucene/analysis/hunspell/IJ.sug    |   1 +
 .../analysis/hunspell/SpellCheckerTest.java   |   8 +
 .../apache/lucene/analysis/hunspell/sug.aff   |  22 +++
 .../apache/lucene/analysis/hunspell/sug.dic   |  15 ++
 .../apache/lucene/analysis/hunspell/sug.sug   |  15 ++
 .../apache/lucene/analysis/hunspell/sug.wrong |  15 ++
 .../apache/lucene/analysis/hunspell/sug2.aff  |  25 +++
 .../apache/lucene/analysis/hunspell/sug2.dic  |  12 ++
 .../apache/lucene/analysis/hunspell/sug2.sug  |   3 +
 .../lucene/analysis/hunspell/sug2.wrong       |   3 +
 14 files changed, 318 insertions(+), 3 deletions(-)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 47c57a3bd4c..7b0bd5fb40c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -152,6 +152,8 @@ public class Dictionary {
   private char[] ignore;
 
   String tryChars = "";
+  String[] neighborKeyGroups = new String[0];
+  boolean enableSplitSuggestions = true;
   List<RepEntry> repTable = new ArrayList<>();
 
   // FSTs used for ICONV/OCONV, output ord pointing to replacement text
@@ -385,6 +387,10 @@ public class Dictionary {
           String[] parts = splitBySpace(reader, reader.readLine(), 3);
           repTable.add(new RepEntry(parts[1], parts[2]));
         }
+      } else if ("KEY".equals(firstWord)) {
+        neighborKeyGroups = singleArgument(reader, line).split("\\|");
+      } else if ("NOSPLITSUGS".equals(firstWord)) {
+        enableSplitSuggestions = false;
       } else if ("FORBIDDENWORD".equals(firstWord)) {
         forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDMIN".equals(firstWord)) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 02fa0b47701..4dd91c09b05 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -18,8 +18,10 @@ package org.apache.lucene.analysis.hunspell;
 
 import java.util.Arrays;
 import java.util.LinkedHashSet;
+import java.util.Locale;
 
 class ModifyingSuggester {
+  private static final int MAX_CHAR_DISTANCE = 4;
   private final LinkedHashSet<String> result = new LinkedHashSet<>();
   private final char[] tryChars;
   private final SpellChecker speller;
@@ -30,11 +32,54 @@ class ModifyingSuggester {
   }
 
   LinkedHashSet<String> suggest(String word) {
-    tryRep(word);
-    tryAddingChar(word);
+    tryVariationsOf(word);
+
+    WordCase wc = WordCase.caseOf(word);
+
+    if (wc == WordCase.MIXED) {
+      int dot = word.indexOf('.');
+      if (dot > 0
+          && dot < word.length() - 1
+          && WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) {
+        result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
+      }
+
+      tryVariationsOf(toLowerCase(word));
+    }
+
     return result;
   }
 
+  private String toLowerCase(String word) {
+    char[] chars = new char[word.length()];
+    for (int i = 0; i < word.length(); i++) {
+      chars[i] = speller.dictionary.caseFold(word.charAt(i));
+    }
+    return new String(chars);
+  }
+
+  private void tryVariationsOf(String word) {
+    trySuggestion(word.toUpperCase(Locale.ROOT));
+    if (checkDictionaryForSplitSuggestions(word)) {
+      return;
+    }
+
+    tryRep(word);
+
+    trySwappingChars(word);
+    tryLongSwap(word);
+    tryNeighborKeys(word);
+    tryRemovingChar(word);
+    tryAddingChar(word);
+    tryMovingChar(word);
+    tryReplacingChar(word);
+    tryTwoDuplicateChars(word);
+
+    if (speller.dictionary.enableSplitSuggestions) {
+      trySplitting(word);
+    }
+  }
+
   private void tryRep(String word) {
     for (RepEntry entry : speller.dictionary.repTable) {
       for (String candidate : entry.substitute(word)) {
@@ -50,6 +95,75 @@ class ModifyingSuggester {
     }
   }
 
+  private void trySwappingChars(String word) {
+    int length = word.length();
+    for (int i = 0; i < length - 1; i++) {
+      char c1 = word.charAt(i);
+      char c2 = word.charAt(i + 1);
+      trySuggestion(word.substring(0, i) + c2 + c1 + word.substring(i + 2));
+    }
+
+    if (length == 4 || length == 5) {
+      tryDoubleSwapForShortWords(word, length);
+    }
+  }
+
+  // swap the first two and the last two chars at once: ahev -> have, owudl -> would
+  private void tryDoubleSwapForShortWords(String word, int length) {
+    char[] candidate = word.toCharArray();
+    candidate[0] = word.charAt(1);
+    candidate[1] = word.charAt(0);
+    candidate[length - 1] = word.charAt(length - 2);
+    candidate[length - 2] = word.charAt(length - 1);
+    trySuggestion(new String(candidate));
+
+    if (candidate.length == 5) {
+      candidate[0] = word.charAt(0);
+      candidate[1] = word.charAt(2);
+      candidate[2] = word.charAt(1);
+      trySuggestion(new String(candidate));
+    }
+  }
+
+  private void tryNeighborKeys(String word) {
+    for (int i = 0; i < word.length(); i++) {
+      char c = word.charAt(i);
+      char up = Character.toUpperCase(c);
+      if (up != c) {
+        trySuggestion(word.substring(0, i) + up + word.substring(i + 1));
+      }
+
+      // try the other chars of every KEY group (keyboard neighbors) that contains this char
+      for (String group : speller.dictionary.neighborKeyGroups) {
+        if (group.indexOf(c) >= 0) {
+          for (int j = 0; j < group.length(); j++) {
+            if (group.charAt(j) != c) {
+              trySuggestion(word.substring(0, i) + group.charAt(j) + word.substring(i + 1));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  private void tryLongSwap(String word) {
+    for (int i = 0; i < word.length(); i++) {
+      for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
+        char c1 = word.charAt(i);
+        char c2 = word.charAt(j);
+        String prefix = word.substring(0, i);
+        String suffix = word.substring(j + 1);
+        trySuggestion(prefix + c2 + word.substring(i + 1, j) + c1 + suffix);
+      }
+    }
+  }
+
+  private void tryRemovingChar(String word) {
+    for (int i = 0; i < word.length(); i++) {
+      trySuggestion(word.substring(0, i) + word.substring(i + 1));
+    }
+  }
+
   private void tryAddingChar(String word) {
     for (int i = 0; i <= word.length(); i++) {
       String prefix = word.substring(0, i);
@@ -60,6 +174,75 @@ class ModifyingSuggester {
     }
   }
 
+  private void tryMovingChar(String word) {
+    for (int i = 0; i < word.length(); i++) {
+      for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
+        String prefix = word.substring(0, i);
+        trySuggestion(prefix + word.substring(i + 1, j) + word.charAt(i) + word.substring(j));
+        trySuggestion(prefix + word.charAt(j) + word.substring(i, j) + word.substring(j + 1));
+      }
+    }
+  }
+
+  private void tryReplacingChar(String word) {
+    for (int i = 0; i < word.length(); i++) {
+      String prefix = word.substring(0, i);
+      String suffix = word.substring(i + 1);
+      for (char toInsert : tryChars) {
+        if (toInsert != word.charAt(i)) {
+          trySuggestion(prefix + toInsert + suffix);
+        }
+      }
+    }
+  }
+
+  // perhaps two adjacent characters were typed twice (for example vacation -> vacacation):
+  // on detecting an "abab"-like run, try the word with one of the repeated pairs removed
+  private void tryTwoDuplicateChars(String word) {
+    int dupLen = 0;
+    for (int i = 2; i < word.length(); i++) {
+      if (word.charAt(i) == word.charAt(i - 2)) {
+        dupLen++;
+        if (dupLen == 3 || dupLen == 2 && i >= 4) {
+          trySuggestion(word.substring(0, i - 1) + word.substring(i + 1));
+          dupLen = 0;
+        }
+      } else {
+        dupLen = 0;
+      }
+    }
+  }
+
+  private boolean checkDictionaryForSplitSuggestions(String word) {
+    boolean found = false;
+    for (int i = 1; i < word.length() - 1; i++) {
+      String w1 = word.substring(0, i);
+      String w2 = word.substring(i);
+      found |= trySuggestion(w1 + " " + w2);
+      if (shouldSplitByDash()) {
+        found |= trySuggestion(w1 + "-" + w2);
+      }
+    }
+    return found;
+  }
+
+  private void trySplitting(String word) {
+    for (int i = 1; i < word.length() - 1; i++) {
+      String w1 = word.substring(0, i);
+      String w2 = word.substring(i);
+      if (speller.checkWord(w1) && speller.checkWord(w2)) {
+        result.add(w1 + " " + w2);
+        if (shouldSplitByDash()) {
+          result.add(w1 + "-" + w2);
+        }
+      }
+    }
+  }
+
+  private boolean shouldSplitByDash() {
+    return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a");
+  }
+
   private boolean trySuggestion(String candidate) {
     if (speller.checkWord(candidate)) {
       result.add(candidate);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 747b209fa32..d69940c0562 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -414,7 +414,10 @@ public class SpellChecker {
         String chunk = word.substring(chunkStart, chunkEnd);
         if (!spell(chunk)) {
           for (String chunkSug : suggest(chunk)) {
-            result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
+            String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
+            if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length(), scratch)) {
+              result.add(replaced);
+            }
           }
         }
       }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
index 01fffd914c0..1499ee46ca0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -37,6 +37,10 @@ enum WordCase {
     return get(startsWithLower, seenUpper, seenLower);
   }
 
+  static WordCase caseOf(CharSequence word) {
+    return caseOf(word, word.length());
+  }
+
   static WordCase caseOf(CharSequence word, int length) {
     boolean startsWithLower = Character.isLowerCase(word.charAt(0));
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug
new file mode 100644
index 00000000000..582b7956b5f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug
@@ -0,0 +1 @@
+IJs, ijs
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 49514ae6d8b..eedef38217e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -156,6 +156,14 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("germancompounding");
   }
 
+  public void testModifyingSuggestions() throws Exception {
+    doTest("sug");
+  }
+
+  public void testModifyingSuggestions2() throws Exception {
+    doTest("sug2");
+  }
+
   protected void doTest(String name) throws Exception {
     checkSpellCheckerExpectations(
         Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff
new file mode 100644
index 00000000000..8f150cdf4cd
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff
@@ -0,0 +1,22 @@
+# new suggestion methods of Hunspell 1.5:
+# capitalization: nasa -> NASA
+# long swap: permenant -> permanent
+# long move: Ghandi -> Gandhi
+# double two characters: vacacation -> vacation
+# space with REP: "alot" -> "a lot" ("a lot" needs to be in the dic file.)
+#
+# Note: see test "ph" for the newer and
+# simpler method to handle common misspellings,
+# for example, alot->a lot, inspite->in spite
+# (it gives the best suggestion and limits
+# ngram/phonetic suggestions)
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+REP 2
+REP alot a_lot
+REP inspite in_spite
+KEY qwertzuiop|asdfghjkl|yxcvbnm|aq
+WORDCHARS .-
+FORBIDDENWORD ?
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic
new file mode 100644
index 00000000000..1d019ceb401
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic
@@ -0,0 +1,15 @@
+13
+NASA
+Gandhi
+grateful
+permanent
+vacation
+a
+lot
+have
+which
+McDonald
+permanent-vacation/?
+in
+spite
+inspire
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug
new file mode 100644
index 00000000000..bea54b8f02f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug
@@ -0,0 +1,15 @@
+NASA
+Gandhi
+grateful
+permanent
+vacation
+a lot, lot
+in spite, inspire
+permanent. Vacation
+have
+which
+Gandhi
+McDonald
+permanent
+
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong
new file mode 100644
index 00000000000..0093de893ab
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong
@@ -0,0 +1,15 @@
+nasa
+Ghandi
+greatful
+permenant
+vacacation
+alot
+inspite
+permanent.Vacation
+ahev
+hwihc
+GAndhi
+Mcdonald
+permqnent
+permanent-vacation
+permqnent-vacation
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
new file mode 100644
index 00000000000..bb7c8803df2
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff
@@ -0,0 +1,25 @@
+# new suggestion methods of Hunspell 1.7:
+# dictionary word pairs with spaces or dashes
+# get top priority, and remove the other, not
+# "good" (uppercase, REP, ph:) suggestions:
+#
+# "alot" -> "a lot"
+#
+# Note: use ph: at the dictionary word pair
+# with space or dash to keep the other not
+# "good" suggestions, for example
+#
+# a lot ph:alot
+#
+# results "alot" -> "a lot", "alto", "slot"...
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+KEY qwertzuiop|asdfghjkl|yxcvbnm|aq
+
+# Note: TRY containing the letter "a" or "-" is needed for
+# checking dictionary word pairs with dashes
+TRY esianrtolcdugmphbyfvkwz'
+WORDCHARS .-
+FORBIDDENWORD ?
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
new file mode 100644
index 00000000000..86311a9e9d5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic
@@ -0,0 +1,12 @@
+11
+a
+lot
+a lot
+alto
+in
+spite
+in spite
+inspire
+scot
+free
+scot-free
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
new file mode 100644
index 00000000000..65b7537d754
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug
@@ -0,0 +1,3 @@
+a lot
+in spite
+scot-free
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
new file mode 100644
index 00000000000..4cfc5697c22
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong
@@ -0,0 +1,3 @@
+alot
+inspite
+scotfree