From a79f641561923d8314519962410bc871d9f79add Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Wed, 3 Feb 2021 17:45:56 +0100 Subject: [PATCH] LUCENE-9720: Hunspell: more ways to vary misspelled word variations for suggestions (#2286) --- .../lucene/analysis/hunspell/Dictionary.java | 6 + .../analysis/hunspell/ModifyingSuggester.java | 187 +++++++++++++++++- .../analysis/hunspell/SpellChecker.java | 5 +- .../lucene/analysis/hunspell/WordCase.java | 4 + .../apache/lucene/analysis/hunspell/IJ.sug | 1 + .../analysis/hunspell/SpellCheckerTest.java | 8 + .../apache/lucene/analysis/hunspell/sug.aff | 22 +++ .../apache/lucene/analysis/hunspell/sug.dic | 15 ++ .../apache/lucene/analysis/hunspell/sug.sug | 15 ++ .../apache/lucene/analysis/hunspell/sug.wrong | 15 ++ .../apache/lucene/analysis/hunspell/sug2.aff | 25 +++ .../apache/lucene/analysis/hunspell/sug2.dic | 12 ++ .../apache/lucene/analysis/hunspell/sug2.sug | 3 + .../lucene/analysis/hunspell/sug2.wrong | 3 + 14 files changed, 318 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 47c57a3bd4c..7b0bd5fb40c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -152,6 +152,8 @@ public class Dictionary { private char[] ignore; String tryChars = ""; + String[] neighborKeyGroups = new String[0]; + boolean enableSplitSuggestions = true; List<RepEntry> repTable = new ArrayList<>(); // FSTs used for ICONV/OCONV, output ord pointing to replacement text @@ -385,6 +387,10 @@ public class Dictionary { String[] parts = splitBySpace(reader, reader.readLine(), 3); repTable.add(new RepEntry(parts[1], parts[2])); } + } else if ("KEY".equals(firstWord)) { + neighborKeyGroups = singleArgument(reader, line).split("\\|"); + } else if ("NOSPLITSUGS".equals(firstWord)) { + enableSplitSuggestions = false; } else if ("FORBIDDENWORD".equals(firstWord)) { forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line)); } else if ("COMPOUNDMIN".equals(firstWord)) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java index 02fa0b47701..4dd91c09b05 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java @@ -18,8 +18,10 @@ package org.apache.lucene.analysis.hunspell; import java.util.Arrays; import java.util.LinkedHashSet; +import java.util.Locale; class ModifyingSuggester { + private static final int MAX_CHAR_DISTANCE = 4; private final LinkedHashSet<String> result = new LinkedHashSet<>(); private final char[] tryChars; private final SpellChecker speller; @@ -30,11 +32,54 @@ class ModifyingSuggester { } LinkedHashSet<String> suggest(String word) { - tryRep(word); -
tryAddingChar(word); + tryVariationsOf(word); + + WordCase wc = WordCase.caseOf(word); + + if (wc == WordCase.MIXED) { + int dot = word.indexOf('.'); + if (dot > 0 + && dot < word.length() - 1 + && WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) { + result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1)); + } + + tryVariationsOf(toLowerCase(word)); + } + return result; } + private String toLowerCase(String word) { + char[] chars = new char[word.length()]; + for (int i = 0; i < word.length(); i++) { + chars[i] = speller.dictionary.caseFold(word.charAt(i)); + } + return new String(chars); + } + + private void tryVariationsOf(String word) { + trySuggestion(word.toUpperCase(Locale.ROOT)); + if (checkDictionaryForSplitSuggestions(word)) { + return; + } + + tryRep(word); + + trySwappingChars(word); + tryLongSwap(word); + tryNeighborKeys(word); + tryRemovingChar(word); + tryAddingChar(word); + tryMovingChar(word); + tryReplacingChar(word); + tryTwoDuplicateChars(word); + + if (speller.dictionary.enableSplitSuggestions) { + trySplitting(word); + } + } + private void tryRep(String word) { for (RepEntry entry : speller.dictionary.repTable) { for (String candidate : entry.substitute(word)) { @@ -50,6 +95,75 @@ class ModifyingSuggester { } } + private void trySwappingChars(String word) { + int length = word.length(); + for (int i = 0; i < length - 1; i++) { + char c1 = word.charAt(i); + char c2 = word.charAt(i + 1); + trySuggestion(word.substring(0, i) + c2 + c1 + word.substring(i + 2)); + } + + if (length == 4 || length == 5) { + tryDoubleSwapForShortWords(word, length); + } + } + + // ahev -> have, owudl -> would + private void tryDoubleSwapForShortWords(String word, int length) { + char[] candidate = word.toCharArray(); + candidate[0] = word.charAt(1); + candidate[1] = word.charAt(0); + candidate[length - 1] = word.charAt(length - 2); + candidate[length - 2] = word.charAt(length - 1); + trySuggestion(new String(candidate)); + + if 
(candidate.length == 5) { + candidate[0] = word.charAt(0); + candidate[1] = word.charAt(2); + candidate[2] = word.charAt(1); + trySuggestion(new String(candidate)); + } + } + + private void tryNeighborKeys(String word) { + for (int i = 0; i < word.length(); i++) { + char c = word.charAt(i); + char up = Character.toUpperCase(c); + if (up != c) { + trySuggestion(word.substring(0, i) + up + word.substring(i + 1)); + } + + // check neighbor characters in keyboard string + for (String group : speller.dictionary.neighborKeyGroups) { + if (group.indexOf(c) >= 0) { + for (int j = 0; j < group.length(); j++) { + if (group.charAt(j) != c) { + trySuggestion(word.substring(0, i) + group.charAt(j) + word.substring(i + 1)); + } + } + } + } + } + } + + private void tryLongSwap(String word) { + for (int i = 0; i < word.length(); i++) { + for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) { + char c1 = word.charAt(i); + char c2 = word.charAt(j); + String prefix = word.substring(0, i); + String suffix = word.substring(j + 1); + trySuggestion(prefix + c2 + word.substring(i + 1, j) + c1 + suffix); + } + } + } + + private void tryRemovingChar(String word) { + for (int i = 0; i < word.length(); i++) { + trySuggestion(word.substring(0, i) + word.substring(i + 1)); + } + } + private void tryAddingChar(String word) { for (int i = 0; i <= word.length(); i++) { String prefix = word.substring(0, i); @@ -60,6 +174,75 @@ class ModifyingSuggester { } } + private void tryMovingChar(String word) { + for (int i = 0; i < word.length(); i++) { + for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) { + String prefix = word.substring(0, i); + trySuggestion(prefix + word.substring(i + 1, j) + word.charAt(i) + word.substring(j)); + trySuggestion(prefix + word.charAt(j) + word.substring(i, j) + word.substring(j + 1)); + } + } + } + + private void tryReplacingChar(String word) { + for (int i = 0; i < word.length(); i++) { + String prefix = word.substring(0, i); + 
String suffix = word.substring(i + 1); + for (char toInsert : tryChars) { + if (toInsert != word.charAt(i)) { + trySuggestion(prefix + toInsert + suffix); + } + } + } + } + + // perhaps we doubled two characters + // (for example vacation -> vacacation) + private void tryTwoDuplicateChars(String word) { + int dupLen = 0; + for (int i = 2; i < word.length(); i++) { + if (word.charAt(i) == word.charAt(i - 2)) { + dupLen++; + if (dupLen == 3 || dupLen == 2 && i >= 4) { + trySuggestion(word.substring(0, i - 1) + word.substring(i + 1)); + dupLen = 0; + } + } else { + dupLen = 0; + } + } + } + + private boolean checkDictionaryForSplitSuggestions(String word) { + boolean found = false; + for (int i = 1; i < word.length() - 1; i++) { + String w1 = word.substring(0, i); + String w2 = word.substring(i); + found |= trySuggestion(w1 + " " + w2); + if (shouldSplitByDash()) { + found |= trySuggestion(w1 + "-" + w2); + } + } + return found; + } + + private void trySplitting(String word) { + for (int i = 1; i < word.length() - 1; i++) { + String w1 = word.substring(0, i); + String w2 = word.substring(i); + if (speller.checkWord(w1) && speller.checkWord(w2)) { + result.add(w1 + " " + w2); + if (shouldSplitByDash()) { + result.add(w1 + "-" + w2); + } + } + } + } + + private boolean shouldSplitByDash() { + return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a"); + } + private boolean trySuggestion(String candidate) { if (speller.checkWord(candidate)) { result.add(candidate); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index 747b209fa32..d69940c0562 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -414,7 +414,10 @@ public class SpellChecker { String chunk = 
word.substring(chunkStart, chunkEnd); if (!spell(chunk)) { for (String chunkSug : suggest(chunk)) { - result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd)); + String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd); + if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length(), scratch)) { + result.add(replaced); + } } } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java index 01fffd914c0..1499ee46ca0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java @@ -37,6 +37,10 @@ enum WordCase { return get(startsWithLower, seenUpper, seenLower); } + static WordCase caseOf(CharSequence word) { + return caseOf(word, word.length()); + } + static WordCase caseOf(CharSequence word, int length) { boolean startsWithLower = Character.isLowerCase(word.charAt(0)); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug new file mode 100644 index 00000000000..582b7956b5f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.sug @@ -0,0 +1 @@ +IJs, ijs diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index 49514ae6d8b..eedef38217e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -156,6 +156,14 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("germancompounding"); } + public void testModifyingSuggestions() 
throws Exception { + doTest("sug"); + } + + public void testModifyingSuggestions2() throws Exception { + doTest("sug2"); + } + protected void doTest(String name) throws Exception { checkSpellCheckerExpectations( Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff new file mode 100644 index 00000000000..8f150cdf4cd --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.aff @@ -0,0 +1,22 @@ +# new suggestion methods of Hunspell 1.5: +# capitalization: nasa -> NASA +# long swap: permenant -> permanent +# long mov: Ghandi -> Gandhi +# double two characters: vacacation -> vacation +# space with REP: "alot" -> "a lot" ("a lot" need to be in the dic file.) +# +# Note: see test "ph" for the newer and +# more simple method to handle common misspellings, +# for example, alot->a lot, inspite->in spite, +# (that is giving the best suggestion, and limiting +# ngram/phonetic suggestion) + +# switch off ngram suggestion for testing +MAXNGRAMSUGS 0 +REP 2 +REP alot a_lot +REP inspite in_spite +KEY qwertzuiop|asdfghjkl|yxcvbnm|aq +WORDCHARS .- +FORBIDDENWORD ? + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic new file mode 100644 index 00000000000..1d019ceb401 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.dic @@ -0,0 +1,15 @@ +13 +NASA +Gandhi +grateful +permanent +vacation +a +lot +have +which +McDonald +permanent-vacation/? 
+in +spite +inspire diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug new file mode 100644 index 00000000000..bea54b8f02f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.sug @@ -0,0 +1,15 @@ +NASA +Gandhi +grateful +permanent +vacation +a lot, lot +in spite, inspire +permanent. Vacation +have +which +Gandhi +McDonald +permanent + + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong new file mode 100644 index 00000000000..0093de893ab --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug.wrong @@ -0,0 +1,15 @@ +nasa +Ghandi +greatful +permenant +vacacation +alot +inspite +permanent.Vacation +ahev +hwihc +GAndhi +Mcdonald +permqnent +permanent-vacation +permqnent-vacation diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff new file mode 100644 index 00000000000..bb7c8803df2 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.aff @@ -0,0 +1,25 @@ +# new suggestion methods of Hunspell 1.7: +# dictionary word pairs with spaces or dashes +# got top priority, and removes other not +# "good" (uppercase, REP, ph:) suggestions: +# +# "alot" -> "a lot" +# +# Note: use ph: at the dictionary word pair +# with space or dash to keep the other not +# "good" suggestions, for example +# +# a lot ph:alot +# +# results "alot" -> "a lot", "alto", "slot"... + +# switch off ngram suggestion for testing +MAXNGRAMSUGS 0 +KEY qwertzuiop|asdfghjkl|yxcvbnm|aq + +# Note: TRY with a letter "a" or "-" needs for +# checking dictionary word pairs with dashes +TRY esianrtolcdugmphbyfvkwz' +WORDCHARS .- +FORBIDDENWORD ? 
+ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic new file mode 100644 index 00000000000..86311a9e9d5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.dic @@ -0,0 +1,12 @@ +11 +a +lot +a lot +alto +in +spite +in spite +inspire +scot +free +scot-free diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug new file mode 100644 index 00000000000..65b7537d754 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.sug @@ -0,0 +1,3 @@ +a lot +in spite +scot-free diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong new file mode 100644 index 00000000000..4cfc5697c22 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/sug2.wrong @@ -0,0 +1,3 @@ +alot +inspite +scotfree