LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)

2021-02-06 17:04:12 +01:00 · 2021-02-06 17:04:12 +01:00 · 1852d7ad5a
parent 573b442903
commit 1852d7ad5a
14 changed files with 321 additions and 54 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -44,6 +44,8 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@ -80,6 +82,7 @@ public class Dictionary {
  // TODO: really for suffixes we should reverse the automaton and run them backwards
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+  private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
  static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
  CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);

@ -386,8 +389,7 @@ public class Dictionary {
        fullStrip = true;
      } else if ("LANG".equals(firstWord)) {
        language = singleArgument(reader, line);
-        String langCode = extractLanguageCode(language);
-        alternateCasing = langCode.equals("tr") || langCode.equals("az");
+        this.alternateCasing = hasLanguage("tr", "az");
      } else if ("BREAK".equals(firstWord)) {
        breaks = parseBreaks(reader, line);
      } else if ("WORDCHARS".equals(firstWord)) {
@ -463,6 +465,17 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }

+  /**
+   * Tells whether the dictionary's language matches one of the given codes. Only the part of
+   * {@code language} before the first underscore (as produced by {@link #extractLanguageCode})
+   * takes part in the comparison.
+   *
+   * @param langCodes the language codes to compare against
+   * @return {@code true} if the language code equals any of {@code langCodes}; {@code false} if
+   *     none matches or if no language has been set
+   */
+  private boolean hasLanguage(String... langCodes) {
+    if (language == null) {
+      return false;
+    }
+    String current = extractLanguageCode(language);
+    return Arrays.stream(langCodes).anyMatch(current::equals);
+  }
+
  static String extractLanguageCode(String isoCode) {
    int underscore = isoCode.indexOf("_");
    return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
@ -910,7 +923,7 @@ public class Dictionary {
          if (!hasStemExceptions) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
            if (morphStart >= 0 && morphStart < line.length()) {
-              hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
+              hasStemExceptions = hasStemException(line.substring(morphStart + 1));
            }
          }

@ -963,6 +976,23 @@ public class Dictionary {
    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
  }

+  /**
+   * Returns a copy of {@code word} in which every char has been passed through {@code caseFold}.
+   *
+   * @param word the word to convert
+   * @return a new string of the same length with each char case-folded
+   */
+  String toLowerCase(String word) {
+    char[] folded = word.toCharArray();
+    for (int pos = 0; pos < folded.length; pos++) {
+      folded[pos] = caseFold(folded[pos]);
+    }
+    return new String(folded);
+  }
+
+  /**
+   * Returns a copy of {@code word} whose first char is upper-cased with {@link
+   * Character#toUpperCase(char)} and whose remaining chars are passed through {@code caseFold}.
+   *
+   * @param word the word to convert; may be empty
+   * @return the title-cased word, or {@code word} itself when it is empty
+   */
+  String toTitleCase(String word) {
+    // An empty word has no first char to capitalize; without this guard the
+    // charAt(0) / chars[0] accesses below would throw.
+    if (word.isEmpty()) {
+      return word;
+    }
+    char[] chars = new char[word.length()];
+    chars[0] = Character.toUpperCase(word.charAt(0));
+    for (int i = 1; i < chars.length; i++) {
+      chars[i] = caseFold(word.charAt(i));
+    }
+    return new String(chars);
+  }
+
  private String sortWordsOffline(
      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
    OfflineSorter sorter =
@ -1062,13 +1092,14 @@ public class Dictionary {
        }
        // we possibly have morphological data
        int stemExceptionID = 0;
-        if (hasStemExceptions && end + 1 < line.length()) {
-          String stemException = parseStemException(line.substring(end + 1));
-          if (stemException != null) {
-            stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
-            stemExceptionID =
-                stemExceptionCount + 1; // we use '0' to indicate no exception for the form
-            stemExceptions[stemExceptionCount++] = stemException;
+        if (end + 1 < line.length()) {
+          String morphData = line.substring(end + 1);
+          for (String datum : splitMorphData(morphData)) {
+            if (datum.startsWith("st:")) {
+              stemExceptionID = addStemException(datum.substring(3));
+            } else if (datum.startsWith("ph:") && datum.length() > 3) {
+              addPhoneticRepEntries(entry, datum.substring(3));
+            }
          }
        }

@ -1088,6 +1119,52 @@ public class Dictionary {
    }
  }

+  /**
+   * Appends {@code stemException} to the {@code stemExceptions} array, growing it when needed.
+   *
+   * @param stemException the stem exception text to store
+   * @return the 1-based id of the stored exception; ids start at 1 because '0' is used to
+   *     indicate that a form has no exception
+   */
+  private int addStemException(String stemException) {
+    int id = stemExceptionCount + 1;
+    stemExceptions = ArrayUtil.grow(stemExceptions, id);
+    stemExceptions[stemExceptionCount] = stemException;
+    stemExceptionCount = id;
+    return id;
+  }
+
+  /**
+   * Turns one "ph:" morphological field of a dictionary entry into replacement entries appended
+   * to {@code repTable}.
+   *
+   * <p>Supported forms of the field value:
+   *
+   * <ul>
+   *   <li>{@code prity}: the misspelling is replaced by the dictionary word itself;
+   *   <li>{@code priti->pretti}: an explicit replacement, e.g. "pretty ph:prity ph:priti->pretti"
+   *       to suggest both prity->pretty and pritier->prettiest;
+   *   <li>{@code prity*}: the star and the character before it are dropped from the pattern and
+   *       the last character is dropped from the replacement.
+   * </ul>
+   *
+   * @param word the dictionary word the field belongs to
+   * @param ph the field value, without the "ph:" prefix
+   */
+  private void addPhoneticRepEntries(String word, String ph) {
+    int arrowPos = ph.indexOf("->");
+    boolean explicitTarget = arrowPos > 0;
+    String from = explicitTarget ? ph.substring(0, arrowPos) : ph;
+    String to = explicitTarget ? ph.substring(arrowPos + 2) : word;
+
+    // A trailing '*' shortens both sides, e.g. "pretty ph:prity*" yields "prit->prett" rather
+    // than "prity->pretty", so that both prity->pretty and pritiest->prettiest are suggested.
+    boolean starred = from.endsWith("*") && from.length() > 2 && to.length() > 1;
+    if (starred) {
+      from = from.substring(0, from.length() - 2);
+      to = to.substring(0, to.length() - 1);
+    }
+
+    // For a capitalized word with a lowercase pattern, also register a capitalized pattern so
+    // that capitalized misspellings get good suggestions too,
+    // e.g. Wednesday ph:wendsay gives wendsay -> Wednesday and Wendsay -> Wednesday.
+    boolean titleWordWithLowerPattern =
+        WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(from) == WordCase.LOWER;
+    if (titleWordWithLowerPattern) {
+      // German and Hungarian additionally get a lowercased replacement, to support suggestions
+      // lowercased by compound word generation or derivational suffixes, for example by the
+      // adjectival suffix "-i" of geographical names in Hungarian:
+      // Massachusetts ph:messzecsuzec
+      // messzecsuzeci -> massachusettsi (adjective)
+      // For lowercasing by conditional PFX rules, see e.g. germancompounding test
+      if (hasLanguage("de", "hu")) {
+        repTable.add(new RepEntry(from, toLowerCase(to)));
+      }
+      repTable.add(new RepEntry(toTitleCase(from), to));
+    }
+    repTable.add(new RepEntry(from, to));
+  }
+
  boolean isDotICaseChangeDisallowed(char[] word) {
    return word[0] == 'İ' && !alternateCasing;
  }
@ -1220,29 +1297,31 @@ public class Dictionary {
    }
  }

-  private String parseStemException(String morphData) {
+  /**
+   * Tells whether the given morphological data contains at least one "st:" field.
+   *
+   * @param morphData the morphological data of a dictionary line, split via {@code
+   *     splitMorphData}
+   * @return {@code true} if any of the split fields starts with "st:"
+   */
+  private boolean hasStemException(String morphData) {
+    return splitMorphData(morphData).stream().anyMatch(field -> field.startsWith("st:"));
+  }
+
+  private List<String> splitMorphData(String morphData) {
    // first see if it's an alias
    if (morphAliasCount > 0) {
      try {
        int alias = Integer.parseInt(morphData.trim());
        morphData = morphAliases[alias - 1];
-      } catch (NumberFormatException e) {
-        // fine
+      } catch (NumberFormatException ignored) {
      }
    }
-    // try to parse morph entry
-    int index = morphData.indexOf(" st:");
-    if (index < 0) {
-      index = morphData.indexOf("\tst:");
+    if (morphData.isBlank()) {
+      return Collections.emptyList();
    }
-    if (index >= 0) {
-      int endIndex = indexOfSpaceOrTab(morphData, index + 1);
-      if (endIndex < 0) {
-        endIndex = morphData.length();
-      }
-      return morphData.substring(index + 4, endIndex);
-    }
-    return null;
+    return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
+        .map(String::trim)
+        .filter(s -> !s.isBlank())
+        .collect(Collectors.toList());
  }

  boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -16,9 +16,12 @@
 */
 package org.apache.lucene.analysis.hunspell;

+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.LinkedHashSet;
+import java.util.List;
 import java.util.Locale;
+import java.util.stream.Collectors;

 class ModifyingSuggester {
  private static final int MAX_CHAR_DISTANCE = 4;
@ -36,6 +39,14 @@ class ModifyingSuggester {

    WordCase wc = WordCase.caseOf(word);

+    if (wc == WordCase.UPPER) {
+      tryVariationsOf(speller.dictionary.toLowerCase(word));
+      tryVariationsOf(speller.dictionary.toTitleCase(word));
+      return result.stream()
+          .map(this::tryUpperCase)
+          .collect(Collectors.toCollection(LinkedHashSet::new));
+    }
+
    if (wc == WordCase.MIXED) {
      int dot = word.indexOf('.');
      if (dot > 0
@ -44,27 +55,24 @@ class ModifyingSuggester {
        result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
      }

-      tryVariationsOf(toLowerCase(word));
+      tryVariationsOf(speller.dictionary.toLowerCase(word));
    }

    return result;
  }

-  private String toLowerCase(String word) {
-    char[] chars = new char[word.length()];
-    for (int i = 0; i < word.length(); i++) {
-      chars[i] = speller.dictionary.caseFold(word.charAt(i));
+  private String tryUpperCase(String candidate) {
+    String upper = candidate.toUpperCase(Locale.ROOT);
+    if (upper.contains(" ") || speller.spell(upper)) {
+      return upper;
    }
-    return new String(chars);
+    String title = speller.dictionary.toTitleCase(candidate);
+    return speller.spell(title) ? title : candidate;
  }

  private void tryVariationsOf(String word) {
-    trySuggestion(word.toUpperCase(Locale.ROOT));
-    if (checkDictionaryForSplitSuggestions(word)) {
-      return;
-    }
-
-    tryRep(word);
+    boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
+    hasGoodSuggestions |= tryRep(word);

    trySwappingChars(word);
    tryLongSwap(word);
@ -75,12 +83,24 @@ class ModifyingSuggester {
    tryReplacingChar(word);
    tryTwoDuplicateChars(word);

-    if (speller.dictionary.enableSplitSuggestions) {
+    List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
+    if (!goodSplit.isEmpty()) {
+      List<String> copy = new ArrayList<>(result);
+      result.clear();
+      result.addAll(goodSplit);
+      if (hasGoodSuggestions) {
+        result.addAll(copy);
+      }
+      hasGoodSuggestions = true;
+    }
+
+    if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
      trySplitting(word);
    }
  }

-  private void tryRep(String word) {
+  private boolean tryRep(String word) {
+    int before = result.size();
    for (RepEntry entry : speller.dictionary.repTable) {
      for (String candidate : entry.substitute(word)) {
        if (trySuggestion(candidate)) {
@ -88,11 +108,16 @@ class ModifyingSuggester {
        }

        if (candidate.contains(" ")
-            && Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
+            && Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
          result.add(candidate);
        }
      }
    }
+    return result.size() > before;
+  }
+
+  private boolean checkSimpleWord(String part) {
+    return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
  }

  private void trySwappingChars(String word) {
@ -213,24 +238,30 @@ class ModifyingSuggester {
    }
  }

-  private boolean checkDictionaryForSplitSuggestions(String word) {
-    boolean found = false;
+  private List<String> checkDictionaryForSplitSuggestions(String word) {
+    List<String> result = new ArrayList<>();
    for (int i = 1; i < word.length() - 1; i++) {
      String w1 = word.substring(0, i);
      String w2 = word.substring(i);
-      found |= trySuggestion(w1 + " " + w2);
+      String spaced = w1 + " " + w2;
+      if (speller.checkWord(spaced)) {
+        result.add(spaced);
+      }
      if (shouldSplitByDash()) {
-        found |= trySuggestion(w1 + "-" + w2);
+        String dashed = w1 + "-" + w2;
+        if (speller.checkWord(dashed)) {
+          result.add(dashed);
+        }
      }
    }
-    return found;
+    return result;
  }

  private void trySplitting(String word) {
    for (int i = 1; i < word.length() - 1; i++) {
      String w1 = word.substring(0, i);
      String w2 = word.substring(i);
-      if (speller.checkWord(w1) && speller.checkWord(w2)) {
+      if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
        result.add(w1 + " " + w2);
        if (shouldSplitByDash()) {
          result.add(w1 + "-" + w2);
@ -244,10 +275,6 @@ class ModifyingSuggester {
  }

  private boolean trySuggestion(String candidate) {
-    if (speller.checkWord(candidate)) {
-      result.add(candidate);
-      return true;
-    }
-    return false;
+    return speller.checkWord(candidate) && result.add(candidate);
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -134,7 +134,7 @@ public class SpellChecker {
    return checkWord(word.toCharArray(), word.length(), null);
  }

-  private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+  Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
      return false;
    }
@ -143,6 +143,15 @@ public class SpellChecker {
      return true;
    }

+    return null;
+  }
+
+  private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+    Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
+    if (simpleResult != null) {
+      return simpleResult;
+    }
+
    if (dictionary.compoundRules != null
        && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
      return true;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -44,10 +44,18 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("allcaps");
  }

-  public void rep() throws Exception {
+  public void testRepSuggestions() throws Exception {
    doTest("rep");
  }

+  // Runs the shared doTest driver for the "ph" fixture name
+  // (presumably the ph.aff / ph.dic / ph.wrong / ph.sug files; confirm against doTest).
+  public void testPhSuggestions() throws Exception {
+    doTest("ph");
+  }
+
+  // Runs the shared doTest driver for the "ph2" fixture name
+  // (presumably the ph2.aff / ph2.dic / ph2.good / ph2.wrong / ph2.sug files; confirm against
+  // doTest).
+  public void testPhSuggestions2() throws Exception {
+    doTest("ph2");
+  }
+
  public void testForceUCase() throws Exception {
    doTest("forceucase");
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
@ -0,0 +1,30 @@
+# new suggestion methods of Hunspell 1.7:
+# ph: for dictionary-based suggestions.
+#
+# For example, suggestions for "wich"
+# with this test dictionary:
+#
+# Hunspell 1.3.3
+# wich
+# & wich 4 0: winch, witch, which, wish
+#
+# Hunspell 1.6.2
+# wich
+# & wich 4 0: which, witch, winch, wish
+#
+# Suggestions will be limited for
+# the dictionary words with the same ph: field,
+# and for non-ngram suggestions.
+#
+# Order of the ph: suggestions for the
+# same misspelling, e.g. wich -> which, witch
+# follows the order of the words in the dictionary:
+#
+# which ph:wich
+# witch ph:wich
+#
+# switch off ngram suggestions to check only
+# ph: based suggestions
+MAXNGRAMSUGS 0
+
+TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'-
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
@ -0,0 +1,11 @@
+8
+a lot ph:alot
+in spite ph:inspite
+inspire
+what ph:whta ph:waht
+Wednesday ph:wendsay ph:wensday
+which ph:wich
+witch ph:wich
+winch
+wish
+Oh, my gosh! ph:omg
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
@ -0,0 +1,11 @@
+a lot
+in spite, inspire
+what
+what
+Wednesday
+Wednesday
+Wednesday
+Wednesday
+which, witch, winch, wish
+Oh, my gosh!
+OH, MY GOSH!
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
@ -0,0 +1,11 @@
+alot
+inspite
+whta
+waht
+wensday
+wendsay
+Wensday
+Wendsay
+wich
+omg
+OMG
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
@ -0,0 +1,32 @@
+# switch off ngram suggestion to test
+# usage of ph: dictionary fields in REP
+# suggestions
+SET UTF-8
+
+MAXNGRAMSUGS 0
+
+# test in compounds, too
+COMPOUNDFLAG Y
+
+# test also dictionary items with space,
+# and forbidden compounding, if there is
+# a ph: field with that compound as
+# misspelling in the dictionary
+CHECKCOMPOUNDREP
+
+# test in compound word with affixes
+SFX A Y 1
+SFX A 0 's .
+
+# when the ph: field ends with the character *,
+# strip last character of the pattern and the replacement
+# to match in REP suggestions also at character changes,
+# for example, "pretty ph:prity*" results in a "prit->prett"
+# REP replacement instead of "prity->pretty", to get
+# prity->pretty and pritiest->prettiest suggestions.
+
+SFX B Y 2
+SFX B y iest [^aeiou]y
+SFX B ö őt ö
+
+WORDCHARS '
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
@ -0,0 +1,11 @@
+9
+foo ph:bar ph:baz
+foo bar ph:foobar
+word/Y ph:baz
+stem/Y ph: ph:
+forbidden/Y
+root/YA
+forbidden root/A ph:forbiddenroot
+pretty/B ph:prity*
+foobarö/B ph:fubarő*
+happy/B ph:hepy ph:hepi->happi
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
@ -0,0 +1,9 @@
+foo
+word
+stem
+wordstem
+stemword
+rootforbidden
+root's
+foobarö
+foobarőt
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
@ -0,0 +1,14 @@
+foo
+foo, word
+foo bar
+wordstem
+stemword
+stemwordstem
+forbidden root
+forbidden root's
+pretty
+prettiest
+foobarö
+foobarőt
+happy
+happiest
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
@ -0,0 +1,15 @@
+bar
+baz
+foobar
+bazstem
+stembaz
+stembazstem
+forbiddenroot
+forbiddenroot's
+rootforbiddenroot
+prity
+pritiest
+fubarö
+fubarőt
+hepy
+hepiest
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
@ -10,7 +10,7 @@ REP shun$ tion
 REP ^alot$ a_lot  # add the highest priority for "a lot" suggestion to "alot"
 REP ^foo$ bar
 REP ' _    # "un'alunno" -> "un alunno"
-REP ^vinte<EFBFBD>n$ vinte_e_un
+REP ^vinteún$ vinte_e_un
 REP s 's