mirror of https://github.com/apache/lucene.git
LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
This commit is contained in:
parent
5fd18881e9
commit
c3166e1dc3
|
@ -70,7 +70,7 @@ import org.apache.lucene.util.fst.Util;
|
|||
|
||||
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
|
||||
public class Dictionary {
|
||||
// Derived from woorm/ openoffice dictionaries.
|
||||
// Derived from woorm/LibreOffice dictionaries.
|
||||
// See TestAllDictionaries.testMaxPrologueNeeded.
|
||||
static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.Arrays;
|
|||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
|
||||
class ModifyingSuggester {
|
||||
|
@ -36,12 +37,17 @@ class ModifyingSuggester {
|
|||
}
|
||||
|
||||
LinkedHashSet<String> suggest(String word, WordCase wordCase) {
|
||||
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
|
||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
|
||||
trySuggestion(low);
|
||||
}
|
||||
|
||||
tryVariationsOf(word);
|
||||
|
||||
if (wordCase == WordCase.TITLE) {
|
||||
tryVariationsOf(speller.dictionary.toLowerCase(word));
|
||||
tryVariationsOf(low);
|
||||
} else if (wordCase == WordCase.UPPER) {
|
||||
tryVariationsOf(speller.dictionary.toLowerCase(word));
|
||||
tryVariationsOf(low);
|
||||
tryVariationsOf(speller.dictionary.toTitleCase(word));
|
||||
} else if (wordCase == WordCase.MIXED) {
|
||||
int dot = word.indexOf('.');
|
||||
|
@ -51,12 +57,38 @@ class ModifyingSuggester {
|
|||
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
|
||||
}
|
||||
|
||||
tryVariationsOf(speller.dictionary.toLowerCase(word));
|
||||
boolean capitalized = Character.isUpperCase(word.charAt(0));
|
||||
if (capitalized) {
|
||||
tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
|
||||
}
|
||||
|
||||
tryVariationsOf(low);
|
||||
|
||||
if (capitalized) {
|
||||
tryVariationsOf(speller.dictionary.toTitleCase(low));
|
||||
}
|
||||
|
||||
return result.stream()
|
||||
.map(s -> capitalizeAfterSpace(low, s))
|
||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// aNew -> "a New" (instead of "a new")
|
||||
private String capitalizeAfterSpace(String lowMisspelled, String candidate) {
|
||||
int space = candidate.indexOf(' ');
|
||||
int tail = candidate.length() - space - 1;
|
||||
if (space > 0
|
||||
&& lowMisspelled.regionMatches(lowMisspelled.length() - tail, candidate, space + 1, tail)) {
|
||||
return candidate.substring(0, space + 1)
|
||||
+ Character.toUpperCase(candidate.charAt(space + 1))
|
||||
+ candidate.substring(space + 2);
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
|
||||
private void tryVariationsOf(String word) {
|
||||
hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
|
||||
hasGoodSuggestions |= tryRep(word);
|
||||
|
|
|
@ -435,7 +435,7 @@ public class SpellChecker {
|
|||
|
||||
Set<String> result = new LinkedHashSet<>();
|
||||
for (String candidate : suggestions) {
|
||||
result.add(adjustSuggestionCase(candidate, wordCase));
|
||||
result.add(adjustSuggestionCase(candidate, wordCase, word));
|
||||
if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
|
||||
result.add(candidate);
|
||||
}
|
||||
|
@ -443,16 +443,18 @@ public class SpellChecker {
|
|||
return result.stream().map(this::cleanOutput).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private String adjustSuggestionCase(String candidate, WordCase original) {
|
||||
if (original == WordCase.UPPER) {
|
||||
private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
|
||||
if (originalCase == WordCase.UPPER) {
|
||||
String upper = candidate.toUpperCase(Locale.ROOT);
|
||||
if (upper.contains(" ") || spell(upper)) {
|
||||
return upper;
|
||||
}
|
||||
}
|
||||
if (original == WordCase.UPPER || original == WordCase.TITLE) {
|
||||
String title = dictionary.toTitleCase(candidate);
|
||||
return spell(title) ? title : candidate;
|
||||
if (Character.isUpperCase(original.charAt(0))) {
|
||||
String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
|
||||
if (title.contains(" ") || spell(title)) {
|
||||
return title;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
|
|
|
@ -184,6 +184,10 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("sug2");
|
||||
}
|
||||
|
||||
// Delegates to doTest with the fixture name "i58202".
// NOTE(review): doTest is defined outside this view; presumably it checks the i58202
// .aff/.dic/.good/.wrong/.sug resources (mixed-case suggestion cases) — confirm.
public void testMixedCaseSuggestionHeuristics() throws Exception {
|
||||
doTest("i58202");
|
||||
}
|
||||
|
||||
// Delegates to doTest with the fixture name "map".
// NOTE(review): doTest is defined outside this view; presumably it checks the "map"
// test resources — confirm.
public void testMapSuggestions() throws Exception {
|
||||
doTest("map");
|
||||
}
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
# case suggestions
|
||||
MAXNGRAMSUGS 0
|
||||
# capitalise baz->Baz
|
||||
TRY B
|
|
@ -0,0 +1,5 @@
|
|||
4
|
||||
foo
|
||||
bar
|
||||
Baz
|
||||
Boo
|
|
@ -0,0 +1,10 @@
|
|||
foo
|
||||
bar
|
||||
Foo
|
||||
Bar
|
||||
Baz
|
||||
Boo
|
||||
FOO
|
||||
BAR
|
||||
BAZ
|
||||
BOO
|
|
@ -0,0 +1,13 @@
|
|||
foo, Boo
|
||||
Bar
|
||||
Baz
|
||||
Boo
|
||||
foo bar
|
||||
foo Bar
|
||||
Foo bar
|
||||
Foo Bar
|
||||
foo Baz
|
||||
Foo Baz
|
||||
Baz foo
|
||||
Baz Foo
|
||||
Baz Boo
|
|
@ -0,0 +1,13 @@
|
|||
fOO
|
||||
BAr
|
||||
baz
|
||||
BOo
|
||||
foobar
|
||||
fooBar
|
||||
Foobar
|
||||
FooBar
|
||||
fooBaz
|
||||
FooBaz
|
||||
Bazfoo
|
||||
BazFoo
|
||||
BazBoo
|
Loading…
Reference in New Issue