LUCENE-9742: Hunspell: suggest dictionary entries similar to the misspelled word (#2320)

2021-02-09 08:12:34 +01:00 · 2021-02-09 08:12:34 +01:00 · 24984ff4e2
parent e186d8c439
commit 24984ff4e2
7 changed files with 397 additions and 25 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -169,6 +169,9 @@ public class Dictionary {
  boolean enableSplitSuggestions = true;
  List<RepEntry> repTable = new ArrayList<>();
  List<List<String>> mapTable = new ArrayList<>();
+  int maxDiff = 5;
+  int maxNGramSuggestions = Integer.MAX_VALUE;
+  boolean onlyMaxDiff;

  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
  FST<CharsRef> iconv;
@ -409,6 +412,16 @@ public class Dictionary {
        neighborKeyGroups = singleArgument(reader, line).split("\\|");
      } else if ("NOSPLITSUGS".equals(firstWord)) {
        enableSplitSuggestions = false;
+      } else if ("MAXNGRAMSUGS".equals(firstWord)) {
+        maxNGramSuggestions = Integer.parseInt(singleArgument(reader, line));
+      } else if ("MAXDIFF".equals(firstWord)) {
+        int i = Integer.parseInt(singleArgument(reader, line));
+        if (i < 0 || i > 10) {
+          throw new ParseException("MAXDIFF should be between 0 and 10", reader.getLineNumber());
+        }
+        maxDiff = i;
+      } else if ("ONLYMAXDIFF".equals(firstWord)) {
+        onlyMaxDiff = true;
      } else if ("FORBIDDENWORD".equals(firstWord)) {
        forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIN".equals(firstWord)) {
@ -487,7 +500,7 @@ public class Dictionary {
    return mapEntry;
  }

-  private boolean hasLanguage(String... langCodes) {
+  boolean hasLanguage(String... langCodes) {
    if (language == null) return false;
    String langCode = extractLanguageCode(language);
    for (String code : langCodes) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.IntsRefFSTEnum;
+
+/**
+ * A class that traverses the entire dictionary and applies affix rules to check if those yield
+ * correct suggestions similar enough to the given misspelled word
+ */
+class GeneratingSuggester {
+  private static final int MAX_ROOTS = 100;
+  private static final int MAX_GUESSES = 100;
+  private final Dictionary dictionary;
+
+  GeneratingSuggester(Dictionary dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
+    List<WeightedWord> roots = findSimilarDictionaryEntries(word, originalCase);
+    List<WeightedWord> expanded = expandRoots(word, roots);
+    TreeSet<WeightedWord> bySimilarity = rankBySimilarity(word, expanded);
+    return getMostRelevantSuggestions(bySimilarity, prevSuggestions);
+  }
+
+  private List<WeightedWord> findSimilarDictionaryEntries(String word, WordCase originalCase) {
+    try {
+      IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words);
+      TreeSet<WeightedWord> roots = new TreeSet<>();
+
+      IntsRefFSTEnum.InputOutput<IntsRef> mapping;
+      while ((mapping = fstEnum.next()) != null) {
+        IntsRef key = mapping.input;
+        if (Math.abs(key.length - word.length()) > 4 || !isSuitableRoot(mapping.output)) continue;
+
+        String root = toString(key);
+        if (originalCase == WordCase.LOWER
+            && WordCase.caseOf(root) == WordCase.TITLE
+            && !dictionary.hasLanguage("de")) {
+          continue;
+        }
+
+        String lower = dictionary.toLowerCase(root);
+        int sc =
+            ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + commonPrefix(word, root);
+
+        roots.add(new WeightedWord(root, sc));
+      }
+      return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList());
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private static String toString(IntsRef key) {
+    char[] chars = new char[key.length];
+    for (int i = 0; i < key.length; i++) {
+      chars[i] = (char) key.ints[i + key.offset];
+    }
+    return new String(chars);
+  }
+
+  private boolean isSuitableRoot(IntsRef forms) {
+    for (int i = 0; i < forms.length; i += dictionary.formStep()) {
+      int entryId = forms.ints[forms.offset + i];
+      if (dictionary.hasFlag(entryId, dictionary.needaffix)
+          || dictionary.hasFlag(entryId, dictionary.forbiddenword)
+          || dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG)
+          || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+        continue;
+      }
+      return true;
+    }
+
+    return false;
+  }
+
+  private List<WeightedWord> expandRoots(String word, List<WeightedWord> roots) {
+    int thresh = calcThreshold(word);
+
+    TreeSet<WeightedWord> expanded = new TreeSet<>();
+    for (WeightedWord weighted : roots) {
+      String guess = weighted.word;
+      String lower = dictionary.toLowerCase(guess);
+      int sc =
+          ngram(word.length(), word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH))
+              + commonPrefix(word, guess);
+      if (sc > thresh) {
+        expanded.add(new WeightedWord(guess, sc));
+      }
+    }
+    return expanded.stream().limit(MAX_GUESSES).collect(Collectors.toList());
+  }
+
+  // find minimum threshold for a passable suggestion
+  // mangle original word three different ways
+  // and score them to generate a minimum acceptable score
+  private static int calcThreshold(String word) {
+    int thresh = 0;
+    for (int sp = 1; sp < 4; sp++) {
+      char[] mw = word.toCharArray();
+      for (int k = sp; k < word.length(); k += 4) {
+        mw[k] = '*';
+      }
+
+      thresh += ngram(word.length(), word, new String(mw), EnumSet.of(NGramOptions.ANY_MISMATCH));
+    }
+    return thresh / 3 - 1;
+  }
+
+  private TreeSet<WeightedWord> rankBySimilarity(String word, List<WeightedWord> expanded) {
+    double fact = (10.0 - dictionary.maxDiff) / 5.0;
+    TreeSet<WeightedWord> bySimilarity = new TreeSet<>();
+    for (WeightedWord weighted : expanded) {
+      String guess = weighted.word;
+      String lower = dictionary.toLowerCase(guess);
+      if (lower.equals(word)) {
+        bySimilarity.add(new WeightedWord(guess, weighted.score + 2000));
+        break;
+      }
+
+      int re =
+          ngram(2, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED))
+              + ngram(2, lower, word, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED));
+
+      int score =
+          2 * lcs(word, lower)
+              - Math.abs(word.length() - lower.length())
+              + commonCharacterPositionScore(word, lower)
+              + commonPrefix(word, lower)
+              + ngram(4, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH))
+              + re
+              + (re < (word.length() + lower.length()) * fact ? -1000 : 0);
+      bySimilarity.add(new WeightedWord(guess, score));
+    }
+    return bySimilarity;
+  }
+
+  private List<String> getMostRelevantSuggestions(
+      TreeSet<WeightedWord> bySimilarity, Set<String> prevSuggestions) {
+    List<String> result = new ArrayList<>();
+    boolean hasExcellent = false;
+    for (WeightedWord weighted : bySimilarity) {
+      if (weighted.score > 1000) {
+        hasExcellent = true;
+      } else if (hasExcellent) {
+        break; // leave only excellent suggestions, if any
+      }
+
+      boolean bad = weighted.score < -100;
+      // keep the best ngram suggestions, unless in ONLYMAXDIFF mode
+      if (bad && (!result.isEmpty() || dictionary.onlyMaxDiff)) {
+        break;
+      }
+
+      if (prevSuggestions.stream().noneMatch(weighted.word::contains)
+          && result.stream().noneMatch(weighted.word::contains)) {
+        result.add(weighted.word);
+        if (result.size() > dictionary.maxNGramSuggestions) {
+          break;
+        }
+      }
+
+      if (bad) {
+        break;
+      }
+    }
+    return result;
+  }
+
+  private static int commonPrefix(String s1, String s2) {
+    int i = 0;
+    int limit = Math.min(s1.length(), s2.length());
+    while (i < limit && s1.charAt(i) == s2.charAt(i)) {
+      i++;
+    }
+    return i;
+  }
+
+  // generate an n-gram score comparing s1 and s2
+  private static int ngram(int n, String s1, String s2, EnumSet<NGramOptions> opt) {
+    int score = 0;
+    int l1 = s1.length();
+    int l2 = s2.length();
+    if (l2 == 0) {
+      return 0;
+    }
+    for (int j = 1; j <= n; j++) {
+      int ns = 0;
+      for (int i = 0; i <= (l1 - j); i++) {
+        if (s2.contains(s1.substring(i, i + j))) {
+          ns++;
+        } else if (opt.contains(NGramOptions.WEIGHTED)) {
+          ns--;
+          if (i == 0 || i == l1 - j) {
+            ns--; // side weight
+          }
+        }
+      }
+      score = score + ns;
+      if (ns < 2 && !opt.contains(NGramOptions.WEIGHTED)) {
+        break;
+      }
+    }
+
+    int ns = 0;
+    if (opt.contains(NGramOptions.LONGER_WORSE)) {
+      ns = (l2 - l1) - 2;
+    }
+    if (opt.contains(NGramOptions.ANY_MISMATCH)) {
+      ns = Math.abs(l2 - l1) - 2;
+    }
+    return score - Math.max(ns, 0);
+  }
+
+  private static int lcs(String s1, String s2) {
+    int[] lengths = new int[s2.length() + 1];
+
+    for (int i = 1; i <= s1.length(); i++) {
+      int prev = 0;
+      for (int j = 1; j <= s2.length(); j++) {
+        int cur = lengths[j];
+        lengths[j] =
+            s1.charAt(i - 1) == s2.charAt(j - 1) ? prev + 1 : Math.max(cur, lengths[j - 1]);
+        prev = cur;
+      }
+    }
+    return lengths[s2.length()];
+  }
+
+  private static int commonCharacterPositionScore(String s1, String s2) {
+    int num = 0;
+    int diffPos1 = -1;
+    int diffPos2 = -1;
+    int diff = 0;
+    int i;
+    for (i = 0; i < s1.length() && i < s2.length(); ++i) {
+      if (s1.charAt(i) == s2.charAt(i)) {
+        num++;
+      } else {
+        if (diff == 0) diffPos1 = i;
+        else if (diff == 1) diffPos2 = i;
+        diff++;
+      }
+    }
+    int commonScore = num > 0 ? 1 : 0;
+    if (diff == 2
+        && i == s1.length()
+        && i == s2.length()
+        && s1.charAt(diffPos1) == s2.charAt(diffPos2)
+        && s1.charAt(diffPos2) == s2.charAt(diffPos1)) {
+      return commonScore + 10;
+    }
+    return commonScore;
+  }
+
+  private enum NGramOptions {
+    WEIGHTED,
+    LONGER_WORSE,
+    ANY_MISMATCH
+  }
+
+  private static class WeightedWord implements Comparable<WeightedWord> {
+    final String word;
+    final int score;
+
+    WeightedWord(String word, int score) {
+      this.word = word;
+      this.score = score;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (!(o instanceof WeightedWord)) return false;
+      WeightedWord that = (WeightedWord) o;
+      return score == that.score && word.equals(that.word);
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hash(word, score);
+    }
+
+    @Override
+    public String toString() {
+      return word + "(" + score + ")";
+    }
+
+    @Override
+    public int compareTo(WeightedWord o) {
+      int cmp = Integer.compare(score, o.score);
+      return cmp != 0 ? -cmp : word.compareTo(o.word);
+    }
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -21,33 +21,29 @@ import java.util.Arrays;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
-import java.util.stream.Collectors;

+/** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
  private static final int MAX_CHAR_DISTANCE = 4;
  private final LinkedHashSet<String> result = new LinkedHashSet<>();
  private final char[] tryChars;
  private final SpellChecker speller;
+  boolean hasGoodSuggestions;

  ModifyingSuggester(SpellChecker speller) {
    this.speller = speller;
    tryChars = speller.dictionary.tryChars.toCharArray();
  }

-  LinkedHashSet<String> suggest(String word) {
+  LinkedHashSet<String> suggest(String word, WordCase wordCase) {
    tryVariationsOf(word);

-    WordCase wc = WordCase.caseOf(word);
-
-    if (wc == WordCase.UPPER) {
+    if (wordCase == WordCase.TITLE) {
+      tryVariationsOf(speller.dictionary.toLowerCase(word));
+    } else if (wordCase == WordCase.UPPER) {
      tryVariationsOf(speller.dictionary.toLowerCase(word));
      tryVariationsOf(speller.dictionary.toTitleCase(word));
-      return result.stream()
-          .map(this::tryUpperCase)
-          .collect(Collectors.toCollection(LinkedHashSet::new));
-    }
-
-    if (wc == WordCase.MIXED) {
+    } else if (wordCase == WordCase.MIXED) {
      int dot = word.indexOf('.');
      if (dot > 0
          && dot < word.length() - 1
@ -61,17 +57,8 @@ class ModifyingSuggester {
    return result;
  }

-  private String tryUpperCase(String candidate) {
-    String upper = candidate.toUpperCase(Locale.ROOT);
-    if (upper.contains(" ") || speller.spell(upper)) {
-      return upper;
-    }
-    String title = speller.dictionary.toTitleCase(candidate);
-    return speller.spell(title) ? title : candidate;
-  }
-
  private void tryVariationsOf(String word) {
-    boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
+    hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
    hasGoodSuggestions |= tryRep(word);

    if (!speller.dictionary.mapTable.isEmpty()) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -24,7 +24,9 @@ import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Set;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@ -415,16 +417,44 @@ public class SpellChecker {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

+    WordCase wordCase = WordCase.caseOf(word);
    ModifyingSuggester modifier = new ModifyingSuggester(this);
-    Set<String> result = modifier.suggest(word);
+    Set<String> suggestions = modifier.suggest(word, wordCase);

-    if (word.contains("-") && result.stream().noneMatch(s -> s.contains("-"))) {
-      result.addAll(modifyChunksBetweenDashes(word));
+    if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
+      suggestions.addAll(
+          new GeneratingSuggester(dictionary)
+              .suggest(dictionary.toLowerCase(word), wordCase, suggestions));
    }

+    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
+      suggestions.addAll(modifyChunksBetweenDashes(word));
+    }
+
+    Set<String> result = new LinkedHashSet<>();
+    for (String candidate : suggestions) {
+      result.add(adjustSuggestionCase(candidate, wordCase));
+      if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
+        result.add(candidate);
+      }
+    }
    return new ArrayList<>(result);
  }

+  private String adjustSuggestionCase(String candidate, WordCase original) {
+    if (original == WordCase.UPPER) {
+      String upper = candidate.toUpperCase(Locale.ROOT);
+      if (upper.contains(" ") || spell(upper)) {
+        return upper;
+      }
+    }
+    if (original == WordCase.UPPER || original == WordCase.TITLE) {
+      String title = dictionary.toTitleCase(candidate);
+      return spell(title) ? title : candidate;
+    }
+    return candidate;
+  }
+
  private List<String> modifyChunksBetweenDashes(String word) {
    List<String> result = new ArrayList<>();
    int chunkStart = 0;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base_utf.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base_utf.sug
@ -0,0 +1,13 @@
+looked, look
+text
+hello
+said
+rotten day, rotten-day, rotten
+tomorrow
+seven
+NASA
+horrifying
+speech
+suggest
+Imply
+IMPLY
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.sug
@ -0,0 +1 @@
+MÜSSIG, müßig
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
@ -0,0 +1,8 @@
+foo
+foo
+Bar
+Bar, baz.
+baz.
+baz.
+Quux.
+Quux.