LUCENE-9808: Hunspell suggestions: consider space/dash-separated words for each case variation (#2425)

This commit is contained in:
Peter Gromov 2021-02-24 17:43:37 +01:00 committed by GitHub
parent 9d6fd98810
commit e1ff4c1354
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 29 additions and 16 deletions

View File

@ -565,9 +565,9 @@ public class Hunspell {
}
};
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
modifier.suggest(word, wordCase);
boolean hasGoodSuggestions = modifier.suggest(word, wordCase);
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll(
new GeneratingSuggester(suggestionSpeller)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions));

View File

@ -21,7 +21,6 @@ import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
@ -29,7 +28,6 @@ class ModifyingSuggester {
private final LinkedHashSet<String> result;
private final char[] tryChars;
private final Hunspell speller;
boolean hasGoodSuggestions;
ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
this.speller = speller;
@ -37,19 +35,20 @@ class ModifyingSuggester {
this.result = result;
}
void suggest(String word, WordCase wordCase) {
/** @return whether any of the added suggestions are considered "good" */
boolean suggest(String word, WordCase wordCase) {
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
trySuggestion(low);
}
tryVariationsOf(word);
boolean hasGoodSuggestions = tryVariationsOf(word);
if (wordCase == WordCase.TITLE) {
tryVariationsOf(low);
hasGoodSuggestions |= tryVariationsOf(low);
} else if (wordCase == WordCase.UPPER) {
tryVariationsOf(low);
tryVariationsOf(speller.dictionary.toTitleCase(word));
hasGoodSuggestions |= tryVariationsOf(low);
hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(word));
} else if (wordCase == WordCase.MIXED) {
int dot = word.indexOf('.');
if (dot > 0
@ -60,20 +59,26 @@ class ModifyingSuggester {
boolean capitalized = Character.isUpperCase(word.charAt(0));
if (capitalized) {
tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
hasGoodSuggestions |=
tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
}
tryVariationsOf(low);
hasGoodSuggestions |= tryVariationsOf(low);
if (capitalized) {
tryVariationsOf(speller.dictionary.toTitleCase(low));
hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(low));
}
List<String> adjusted = new ArrayList<>();
for (String candidate : result) {
String s = capitalizeAfterSpace(word, candidate);
adjusted.add(s.equals(candidate) ? adjusted.size() : 0, s);
}
List<String> adjusted =
result.stream().map(s -> capitalizeAfterSpace(word, s)).collect(Collectors.toList());
result.clear();
result.addAll(adjusted);
}
return hasGoodSuggestions;
}
// aNew -> "a New" (instead of "a new")
@ -89,8 +94,8 @@ class ModifyingSuggester {
return candidate;
}
private void tryVariationsOf(String word) {
hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
private boolean tryVariationsOf(String word) {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);
if (!speller.dictionary.mapTable.isEmpty()) {
@ -120,6 +125,7 @@ class ModifyingSuggester {
if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
trySplitting(word);
}
return hasGoodSuggestions;
}
private boolean tryRep(String word) {

View File

@ -23,3 +23,5 @@ TRY esianrtolcdugmphbyfvkwz'
WORDCHARS .-
FORBIDDENWORD ?
REP 1
REP s ti

View File

@ -10,3 +10,6 @@ inspire
scot
free
scot-free
Sm
es
times

View File

@ -1,3 +1,4 @@
a lot
in spite
scot-free
Sm Es, Times, Sm-es

View File

@ -1,3 +1,4 @@
alot
inspite
scotfree
SMEs