LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)

This commit is contained in:
Peter Gromov 2021-02-10 09:21:01 +01:00 committed by GitHub
parent 5fd18881e9
commit c3166e1dc3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 93 additions and 10 deletions

View File

@ -70,7 +70,7 @@ import org.apache.lucene.util.fst.Util;
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
public class Dictionary {
// Derived from woorm/ openoffice dictionaries.
// Derived from woorm/LibreOffice dictionaries.
// See TestAllDictionaries.testMaxPrologueNeeded.
static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;

View File

@ -21,6 +21,7 @@ import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
@ -36,12 +37,17 @@ class ModifyingSuggester {
}
LinkedHashSet<String> suggest(String word, WordCase wordCase) {
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
trySuggestion(low);
}
tryVariationsOf(word);
if (wordCase == WordCase.TITLE) {
tryVariationsOf(speller.dictionary.toLowerCase(word));
tryVariationsOf(low);
} else if (wordCase == WordCase.UPPER) {
tryVariationsOf(speller.dictionary.toLowerCase(word));
tryVariationsOf(low);
tryVariationsOf(speller.dictionary.toTitleCase(word));
} else if (wordCase == WordCase.MIXED) {
int dot = word.indexOf('.');
@ -51,12 +57,38 @@ class ModifyingSuggester {
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
}
tryVariationsOf(speller.dictionary.toLowerCase(word));
boolean capitalized = Character.isUpperCase(word.charAt(0));
if (capitalized) {
tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
}
tryVariationsOf(low);
if (capitalized) {
tryVariationsOf(speller.dictionary.toTitleCase(low));
}
return result.stream()
.map(s -> capitalizeAfterSpace(low, s))
.collect(Collectors.toCollection(LinkedHashSet::new));
}
return result;
}
/**
 * Restores the capital letter that follows the split point of a two-word suggestion, e.g. for
 * the misspelling {@code aNew} the candidate {@code "a new"} becomes {@code "a New"} (instead of
 * staying {@code "a new"}).
 *
 * <p>Only the first space of {@code candidate} is considered. The candidate is modified only
 * when the text after that space is identical to the ending of {@code lowMisspelled}; in every
 * other case it is returned unchanged.
 *
 * @param lowMisspelled the misspelled word (the caller passes its lower-cased form) whose ending
 *     is compared with the part of the candidate after the space
 * @param candidate a suggestion, possibly consisting of space-separated words
 * @return the candidate with the first character after its first space upper-cased, or the
 *     candidate itself when the rule does not apply
 */
private String capitalizeAfterSpace(String lowMisspelled, String candidate) {
  int space = candidate.indexOf(' ');
  if (space <= 0) {
    // no space at all, or nothing precedes it: nothing to adjust
    return candidate;
  }

  int tail = candidate.length() - space - 1;
  if (tail == 0) {
    // The candidate ends with the space. A zero-length region always matches, so without this
    // guard charAt(space + 1) below would throw StringIndexOutOfBoundsException.
    return candidate;
  }

  // regionMatches returns false (rather than throwing) when the tail is longer than the word
  if (!lowMisspelled.regionMatches(lowMisspelled.length() - tail, candidate, space + 1, tail)) {
    return candidate;
  }

  return candidate.substring(0, space + 1)
      + Character.toUpperCase(candidate.charAt(space + 1))
      + candidate.substring(space + 2);
}
private void tryVariationsOf(String word) {
hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);

View File

@ -435,7 +435,7 @@ public class SpellChecker {
Set<String> result = new LinkedHashSet<>();
for (String candidate : suggestions) {
result.add(adjustSuggestionCase(candidate, wordCase));
result.add(adjustSuggestionCase(candidate, wordCase, word));
if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
result.add(candidate);
}
@ -443,16 +443,18 @@ public class SpellChecker {
return result.stream().map(this::cleanOutput).collect(Collectors.toList());
}
private String adjustSuggestionCase(String candidate, WordCase original) {
if (original == WordCase.UPPER) {
private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
if (originalCase == WordCase.UPPER) {
String upper = candidate.toUpperCase(Locale.ROOT);
if (upper.contains(" ") || spell(upper)) {
return upper;
}
}
if (original == WordCase.UPPER || original == WordCase.TITLE) {
String title = dictionary.toTitleCase(candidate);
return spell(title) ? title : candidate;
if (Character.isUpperCase(original.charAt(0))) {
String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
if (title.contains(" ") || spell(title)) {
return title;
}
}
return candidate;
}

View File

@ -184,6 +184,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("sug2");
}
// Exercises suggestion heuristics for mixed-case misspellings by delegating to doTest with the
// "i58202" fixture name. NOTE(review): doTest is defined outside this view; presumably it loads
// the i58202.* dictionary/expectation files — confirm against the test base class.
public void testMixedCaseSuggestionHeuristics() throws Exception {
doTest("i58202");
}
// Delegates to doTest with the "map" fixture name. NOTE(review): doTest is defined outside this
// view; presumably it loads the map.* dictionary/expectation files — confirm.
public void testMapSuggestions() throws Exception {
doTest("map");
}

View File

@ -0,0 +1,4 @@
# case suggestions
MAXNGRAMSUGS 0
# capitalise baz->Baz
TRY B

View File

@ -0,0 +1,5 @@
4
foo
bar
Baz
Boo

View File

@ -0,0 +1,10 @@
foo
bar
Foo
Bar
Baz
Boo
FOO
BAR
BAZ
BOO

View File

@ -0,0 +1,13 @@
foo, Boo
Bar
Baz
Boo
foo bar
foo Bar
Foo bar
Foo Bar
foo Baz
Foo Baz
Baz foo
Baz Foo
Baz Boo

View File

@ -0,0 +1,13 @@
fOO
BAr
baz
BOo
foobar
fooBar
Foobar
FooBar
fooBaz
FooBaz
Bazfoo
BazFoo
BazBoo