From c3166e1dc355e827b19067f037b3a127b2ef79fa Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Wed, 10 Feb 2021 09:21:01 +0100 Subject: [PATCH] LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332) --- .../lucene/analysis/hunspell/Dictionary.java | 2 +- .../analysis/hunspell/ModifyingSuggester.java | 38 +++++++++++++++++-- .../analysis/hunspell/SpellChecker.java | 14 ++++--- .../analysis/hunspell/SpellCheckerTest.java | 4 ++ .../lucene/analysis/hunspell/i58202.aff | 4 ++ .../lucene/analysis/hunspell/i58202.dic | 5 +++ .../lucene/analysis/hunspell/i58202.good | 10 +++++ .../lucene/analysis/hunspell/i58202.sug | 13 +++++++ .../lucene/analysis/hunspell/i58202.wrong | 13 +++++++ 9 files changed, 93 insertions(+), 10 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index f39575c704c..99f60b698d9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -70,7 +70,7 @@ import org.apache.lucene.util.fst.Util; /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */ public class Dictionary { - // Derived from woorm/ openoffice dictionaries. + // Derived from woorm/LibreOffice dictionaries. // See TestAllDictionaries.testMaxPrologueNeeded. static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java index 08dd0189c01..50c5dceadc0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; /** A class that modifies the given misspelled word in various ways to get correct suggestions */ class ModifyingSuggester { @@ -36,12 +37,17 @@ class ModifyingSuggester { } LinkedHashSet suggest(String word, WordCase wordCase) { + String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word; + if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) { + trySuggestion(low); + } + tryVariationsOf(word); if (wordCase == WordCase.TITLE) { - tryVariationsOf(speller.dictionary.toLowerCase(word)); + tryVariationsOf(low); } else if (wordCase == WordCase.UPPER) { - tryVariationsOf(speller.dictionary.toLowerCase(word)); + tryVariationsOf(low); tryVariationsOf(speller.dictionary.toTitleCase(word)); } else if (wordCase == WordCase.MIXED) { int dot = word.indexOf('.'); @@ -51,12 +57,38 @@ class ModifyingSuggester { result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1)); } - tryVariationsOf(speller.dictionary.toLowerCase(word)); + boolean capitalized = Character.isUpperCase(word.charAt(0)); + if (capitalized) { + tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1)); + } + + tryVariationsOf(low); + + if (capitalized) { + tryVariationsOf(speller.dictionary.toTitleCase(low)); + } + + return result.stream() + .map(s -> capitalizeAfterSpace(low, s)) + .collect(Collectors.toCollection(LinkedHashSet::new)); } return result; } + // aNew -> "a New" (instead of "a new") + private String capitalizeAfterSpace(String lowMisspelled, String candidate) { + int space = candidate.indexOf(' '); + int tail = candidate.length() - space - 1; + if (space > 0 + && lowMisspelled.regionMatches(lowMisspelled.length() - tail, candidate, space + 1, tail)) { + return candidate.substring(0, space + 1) + + Character.toUpperCase(candidate.charAt(space + 1)) + + candidate.substring(space + 2); + } + return candidate; + } + private void tryVariationsOf(String word) { hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT)); hasGoodSuggestions |= tryRep(word); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index 0c7aaa0e912..482697f060a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -435,7 +435,7 @@ public class SpellChecker { Set result = new LinkedHashSet<>(); for (String candidate : suggestions) { - result.add(adjustSuggestionCase(candidate, wordCase)); + result.add(adjustSuggestionCase(candidate, wordCase, word)); if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) { result.add(candidate); } @@ -443,16 +443,18 @@ public class SpellChecker { return result.stream().map(this::cleanOutput).collect(Collectors.toList()); } - private String adjustSuggestionCase(String candidate, WordCase original) { - if (original == WordCase.UPPER) { + private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) { + if (originalCase == WordCase.UPPER) { String upper = candidate.toUpperCase(Locale.ROOT); if (upper.contains(" ") || spell(upper)) { return upper; } } - if (original == WordCase.UPPER || original == WordCase.TITLE) { - String title = dictionary.toTitleCase(candidate); - return spell(title) ? title : candidate; + if (Character.isUpperCase(original.charAt(0))) { + String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1); + if (title.contains(" ") || spell(title)) { + return title; + } } return candidate; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index 6ee39943886..441e5d8ce6b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -184,6 +184,10 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("sug2"); } + public void testMixedCaseSuggestionHeuristics() throws Exception { + doTest("i58202"); + } + public void testMapSuggestions() throws Exception { doTest("map"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff new file mode 100644 index 00000000000..11249d4f28e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff @@ -0,0 +1,4 @@ +# case suggestions +MAXNGRAMSUGS 0 +# capitalise baz->Baz +TRY B diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic new file mode 100644 index 00000000000..19e1980ba23 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic @@ -0,0 +1,5 @@ +4 +foo +bar +Baz +Boo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good new file mode 100644 index 00000000000..88a079a55d6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good @@ -0,0 +1,10 @@ +foo +bar +Foo +Bar +Baz +Boo +FOO +BAR +BAZ +BOO diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug new file mode 100644 index 00000000000..bc784acef94 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug @@ -0,0 +1,13 @@ +foo, Boo +Bar +Baz +Boo +foo bar +foo Bar +Foo bar +Foo Bar +foo Baz +Foo Baz +Baz foo +Baz Foo +Baz Boo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong new file mode 100644 index 00000000000..886584d8090 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong @@ -0,0 +1,13 @@ +fOO +BAr +baz +BOo +foobar +fooBar +Foobar +FooBar +fooBaz +FooBaz +Bazfoo +BazFoo +BazBoo