LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)

2021-02-08 10:59:53 +01:00 · 2021-02-08 10:59:53 +01:00 · 653626399f
parent 061233ca4e
commit 653626399f
7 changed files with 73 additions and 0 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -168,6 +168,7 @@ public class Dictionary {
  String[] neighborKeyGroups = new String[0];
  boolean enableSplitSuggestions = true;
  List<RepEntry> repTable = new ArrayList<>();
+  List<List<String>> mapTable = new ArrayList<>();

  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
  FST<CharsRef> iconv;
@ -399,6 +400,11 @@ public class Dictionary {
          String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
          repTable.add(new RepEntry(parts[1], parts[2]));
        }
+      } else if ("MAP".equals(firstWord)) {
+        int count = parseNum(reader, line);
+        for (int i = 0; i < count; i++) {
+          mapTable.add(parseMapEntry(reader, reader.readLine()));
+        }
      } else if ("KEY".equals(firstWord)) {
        neighborKeyGroups = singleArgument(reader, line).split("\\|");
      } else if ("NOSPLITSUGS".equals(firstWord)) {
@ -462,6 +468,25 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }

+  /**
+   * Parses one {@code MAP} line into the group of strings it declares as similar.
+   *
+   * <p>Each code point outside parentheses becomes its own entry; a parenthesized run such as
+   * {@code (ss)} becomes a single multi-character entry. Empty groups {@code ()} are ignored.
+   *
+   * @param reader used to report the line number when parsing fails
+   * @param line the raw {@code MAP} line, or {@code null} if the file ended early
+   * @return the entries of the group, in declaration order
+   * @throws ParseException if the line is missing or a parenthesis is not closed
+   */
+  private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
+    if (line == null) {
+      // readLine() yields null at end of file: fewer entries than the MAP header announced
+      throw new ParseException("Unexpected end of file in MAP table", reader.getLineNumber());
+    }
+
+    String unparsed = singleArgument(reader, line);
+    List<String> mapEntry = new ArrayList<>();
+    int j = 0;
+    while (j < unparsed.length()) {
+      if (unparsed.charAt(j) == '(') {
+        int closing = unparsed.indexOf(')', j);
+        if (closing < 0) {
+          throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber());
+        }
+
+        // Skip an empty group "()": an empty entry matches at every offset without consuming
+        // any input, so the MAP-based suggester would never advance past it.
+        if (closing > j + 1) {
+          mapEntry.add(unparsed.substring(j + 1, closing));
+        }
+        j = closing + 1;
+      } else {
+        // Advance by whole code points so that a surrogate pair stays a single entry.
+        int end = j + Character.charCount(unparsed.codePointAt(j));
+        mapEntry.add(unparsed.substring(j, end));
+        j = end;
+      }
+    }
+    return mapEntry;
+  }
+
  private boolean hasLanguage(String... langCodes) {
    if (language == null) return false;
    String langCode = extractLanguageCode(language);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -74,6 +74,10 @@ class ModifyingSuggester {
    boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
    hasGoodSuggestions |= tryRep(word);

+    if (!speller.dictionary.mapTable.isEmpty()) {
+      enumerateMapReplacements(word, "", 0);
+    }
+
    trySwappingChars(word);
    tryLongSwap(word);
    tryNeighborKeys(word);
@ -116,6 +120,27 @@ class ModifyingSuggester {
    return result.size() > before;
  }

+  /**
+   * Recursively builds every variant of {@code word} in which substrings are swapped for other
+   * members of their {@code MAP} group, and passes each complete variant to {@code trySuggestion}.
+   *
+   * <p>At every position the method branches once per alternative of each matching entry and once
+   * more for the unchanged character, so the number of variants grows quickly with the number of
+   * positions that have MAP alternatives.
+   *
+   * @param word the misspelled word being processed
+   * @param accumulated the variant built so far for {@code word[0, offset)}
+   * @param offset index in {@code word} of the next character to process
+   */
+  private void enumerateMapReplacements(String word, String accumulated, int offset) {
+    if (offset == word.length()) {
+      trySuggestion(accumulated);
+      return;
+    }
+
+    for (List<String> entries : speller.dictionary.mapTable) {
+      for (String entry : entries) {
+        // An empty entry would match here without consuming input and recurse forever.
+        if (entry.isEmpty()) {
+          continue;
+        }
+        if (word.regionMatches(offset, entry, 0, entry.length())) {
+          for (String replacement : entries) {
+            if (!entry.equals(replacement)) {
+              enumerateMapReplacements(word, accumulated + replacement, offset + entry.length());
+            }
+          }
+        }
+      }
+    }
+
+    // Also explore the branch where the current character is kept as is.
+    enumerateMapReplacements(word, accumulated + word.charAt(offset), offset + 1);
+  }
+
  private boolean checkSimpleWord(String part) {
    return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -180,6 +180,10 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("sug2");
  }

+  /** Checks the spell-checker expectations stored in the {@code map.*} fixture files. */
+  public void testMapSuggestions() throws Exception {
+    final String fixtureName = "map";
+    doTest(fixtureName);
+  }
+
  protected void doTest(String name) throws Exception {
    checkSpellCheckerExpectations(
        Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
@ -0,0 +1,9 @@
+# With MAP suggestion, Hunspell can add missing accents to a word.
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+
+MAP 3
+MAP uúü
+MAP oóö
+MAP ß(ss)
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
@ -0,0 +1,4 @@
+3
+Frühstück
+tükörfúró
+groß
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
@ -0,0 +1,3 @@
+Frühstück
+tükörfúró
+groß
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
@ -0,0 +1,3 @@
+Fruhstuck
+tukorfuro
+gross