mirror of https://github.com/apache/lucene.git
LUCENE-9720: Hunspell: more ways to vary misspelled word variations for suggestions (#2286)
This commit is contained in:
parent
d88264ba72
commit
a79f641561
|
@ -152,6 +152,8 @@ public class Dictionary {
|
|||
private char[] ignore;
|
||||
|
||||
// Characters used when generating suggestions; the suggester checks this string for '-' or 'a'
// to decide whether dash-separated splits are offered.
// NOTE(review): presumably filled from the affix file's TRY line -- that parsing is not visible here.
String tryChars = "";
|
||||
// Keyboard-neighbor groups: the argument of the KEY affix directive split on '|'.
// Empty when the affix file has no KEY line.
String[] neighborKeyGroups = new String[0];
|
||||
// Whether the suggester may propose splitting a word in two; switched off by NOSPLITSUGS.
boolean enableSplitSuggestions = true;
|
||||
List<RepEntry> repTable = new ArrayList<>();
|
||||
|
||||
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
||||
|
@ -385,6 +387,10 @@ public class Dictionary {
|
|||
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
||||
repTable.add(new RepEntry(parts[1], parts[2]));
|
||||
}
|
||||
} else if ("KEY".equals(firstWord)) {
|
||||
neighborKeyGroups = singleArgument(reader, line).split("\\|");
|
||||
} else if ("NOSPLITSUGS".equals(firstWord)) {
|
||||
enableSplitSuggestions = false;
|
||||
} else if ("FORBIDDENWORD".equals(firstWord)) {
|
||||
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIN".equals(firstWord)) {
|
||||
|
|
|
@ -18,8 +18,10 @@ package org.apache.lucene.analysis.hunspell;
|
|||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
|
||||
class ModifyingSuggester {
|
||||
private static final int MAX_CHAR_DISTANCE = 4;
|
||||
private final LinkedHashSet<String> result = new LinkedHashSet<>();
|
||||
private final char[] tryChars;
|
||||
private final SpellChecker speller;
|
||||
|
@ -30,11 +32,54 @@ class ModifyingSuggester {
|
|||
}
|
||||
|
||||
LinkedHashSet<String> suggest(String word) {
|
||||
tryRep(word);
|
||||
tryAddingChar(word);
|
||||
tryVariationsOf(word);
|
||||
|
||||
WordCase wc = WordCase.caseOf(word);
|
||||
|
||||
if (wc == WordCase.MIXED) {
|
||||
int dot = word.indexOf('.');
|
||||
if (dot > 0
|
||||
&& dot < word.length() - 1
|
||||
&& WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) {
|
||||
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
|
||||
}
|
||||
|
||||
tryVariationsOf(toLowerCase(word));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private String toLowerCase(String word) {
|
||||
char[] chars = new char[word.length()];
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
chars[i] = speller.dictionary.caseFold(word.charAt(i));
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private void tryVariationsOf(String word) {
|
||||
trySuggestion(word.toUpperCase(Locale.ROOT));
|
||||
if (checkDictionaryForSplitSuggestions(word)) {
|
||||
return;
|
||||
}
|
||||
|
||||
tryRep(word);
|
||||
|
||||
trySwappingChars(word);
|
||||
tryLongSwap(word);
|
||||
tryNeighborKeys(word);
|
||||
tryRemovingChar(word);
|
||||
tryAddingChar(word);
|
||||
tryMovingChar(word);
|
||||
tryReplacingChar(word);
|
||||
tryTwoDuplicateChars(word);
|
||||
|
||||
if (speller.dictionary.enableSplitSuggestions) {
|
||||
trySplitting(word);
|
||||
}
|
||||
}
|
||||
|
||||
private void tryRep(String word) {
|
||||
for (RepEntry entry : speller.dictionary.repTable) {
|
||||
for (String candidate : entry.substitute(word)) {
|
||||
|
@ -50,6 +95,75 @@ class ModifyingSuggester {
|
|||
}
|
||||
}
|
||||
|
||||
private void trySwappingChars(String word) {
|
||||
int length = word.length();
|
||||
for (int i = 0; i < length - 1; i++) {
|
||||
char c1 = word.charAt(i);
|
||||
char c2 = word.charAt(i + 1);
|
||||
trySuggestion(word.substring(0, i) + c2 + c1 + word.substring(i + 2));
|
||||
}
|
||||
|
||||
if (length == 4 || length == 5) {
|
||||
tryDoubleSwapForShortWords(word, length);
|
||||
}
|
||||
}
|
||||
|
||||
// ahev -> have, owudl -> would
|
||||
private void tryDoubleSwapForShortWords(String word, int length) {
|
||||
char[] candidate = word.toCharArray();
|
||||
candidate[0] = word.charAt(1);
|
||||
candidate[1] = word.charAt(0);
|
||||
candidate[length - 1] = word.charAt(length - 2);
|
||||
candidate[length - 2] = word.charAt(length - 1);
|
||||
trySuggestion(new String(candidate));
|
||||
|
||||
if (candidate.length == 5) {
|
||||
candidate[0] = word.charAt(0);
|
||||
candidate[1] = word.charAt(2);
|
||||
candidate[2] = word.charAt(1);
|
||||
trySuggestion(new String(candidate));
|
||||
}
|
||||
}
|
||||
|
||||
private void tryNeighborKeys(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
char c = word.charAt(i);
|
||||
char up = Character.toUpperCase(c);
|
||||
if (up != c) {
|
||||
trySuggestion(word.substring(0, i) + up + word.substring(i + 1));
|
||||
}
|
||||
|
||||
// check neighbor characters in keyboard string
|
||||
for (String group : speller.dictionary.neighborKeyGroups) {
|
||||
if (group.indexOf(c) >= 0) {
|
||||
for (int j = 0; j < group.length(); j++) {
|
||||
if (group.charAt(j) != c) {
|
||||
trySuggestion(word.substring(0, i) + group.charAt(j) + word.substring(i + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void tryLongSwap(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
|
||||
char c1 = word.charAt(i);
|
||||
char c2 = word.charAt(j);
|
||||
String prefix = word.substring(0, i);
|
||||
String suffix = word.substring(j + 1);
|
||||
trySuggestion(prefix + c2 + word.substring(i + 1, j) + c1 + suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void tryRemovingChar(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
trySuggestion(word.substring(0, i) + word.substring(i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
private void tryAddingChar(String word) {
|
||||
for (int i = 0; i <= word.length(); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
|
@ -60,6 +174,75 @@ class ModifyingSuggester {
|
|||
}
|
||||
}
|
||||
|
||||
private void tryMovingChar(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
|
||||
String prefix = word.substring(0, i);
|
||||
trySuggestion(prefix + word.substring(i + 1, j) + word.charAt(i) + word.substring(j));
|
||||
trySuggestion(prefix + word.charAt(j) + word.substring(i, j) + word.substring(j + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void tryReplacingChar(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
String suffix = word.substring(i + 1);
|
||||
for (char toInsert : tryChars) {
|
||||
if (toInsert != word.charAt(i)) {
|
||||
trySuggestion(prefix + toInsert + suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// perhaps we doubled two characters
|
||||
// (for example vacation -> vacacation)
|
||||
private void tryTwoDuplicateChars(String word) {
|
||||
int dupLen = 0;
|
||||
for (int i = 2; i < word.length(); i++) {
|
||||
if (word.charAt(i) == word.charAt(i - 2)) {
|
||||
dupLen++;
|
||||
if (dupLen == 3 || dupLen == 2 && i >= 4) {
|
||||
trySuggestion(word.substring(0, i - 1) + word.substring(i + 1));
|
||||
dupLen = 0;
|
||||
}
|
||||
} else {
|
||||
dupLen = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean checkDictionaryForSplitSuggestions(String word) {
|
||||
boolean found = false;
|
||||
for (int i = 1; i < word.length() - 1; i++) {
|
||||
String w1 = word.substring(0, i);
|
||||
String w2 = word.substring(i);
|
||||
found |= trySuggestion(w1 + " " + w2);
|
||||
if (shouldSplitByDash()) {
|
||||
found |= trySuggestion(w1 + "-" + w2);
|
||||
}
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
private void trySplitting(String word) {
|
||||
for (int i = 1; i < word.length() - 1; i++) {
|
||||
String w1 = word.substring(0, i);
|
||||
String w2 = word.substring(i);
|
||||
if (speller.checkWord(w1) && speller.checkWord(w2)) {
|
||||
result.add(w1 + " " + w2);
|
||||
if (shouldSplitByDash()) {
|
||||
result.add(w1 + "-" + w2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean shouldSplitByDash() {
|
||||
return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a");
|
||||
}
|
||||
|
||||
private boolean trySuggestion(String candidate) {
|
||||
if (speller.checkWord(candidate)) {
|
||||
result.add(candidate);
|
||||
|
|
|
@ -414,7 +414,10 @@ public class SpellChecker {
|
|||
String chunk = word.substring(chunkStart, chunkEnd);
|
||||
if (!spell(chunk)) {
|
||||
for (String chunkSug : suggest(chunk)) {
|
||||
result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
|
||||
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
|
||||
if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length(), scratch)) {
|
||||
result.add(replaced);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,10 @@ enum WordCase {
|
|||
return get(startsWithLower, seenUpper, seenLower);
|
||||
}
|
||||
|
||||
/**
 * Convenience overload that classifies the case of the whole {@code word} by delegating to
 * {@code caseOf(word, word.length())}.
 */
static WordCase caseOf(CharSequence word) {
|
||||
return caseOf(word, word.length());
|
||||
}
|
||||
|
||||
static WordCase caseOf(CharSequence word, int length) {
|
||||
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
IJs, ijs
|
|
@ -156,6 +156,14 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("germancompounding");
|
||||
}
|
||||
|
||||
// Runs the spell-checker expectations for the "sug" test resources (located via sug.aff).
public void testModifyingSuggestions() throws Exception {
|
||||
doTest("sug");
|
||||
}
|
||||
|
||||
// Runs the spell-checker expectations for the "sug2" test resources (located via sug2.aff).
public void testModifyingSuggestions2() throws Exception {
|
||||
doTest("sug2");
|
||||
}
|
||||
|
||||
protected void doTest(String name) throws Exception {
|
||||
checkSpellCheckerExpectations(
|
||||
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
# new suggestion methods of Hunspell 1.5:
|
||||
# capitalization: nasa -> NASA
|
||||
# long swap: permenant -> permanent
|
||||
# long move: Ghandi -> Gandhi
|
||||
# double two characters: vacacation -> vacation
|
||||
# space with REP: "alot" -> "a lot" ("a lot" needs to be in the dic file.)
|
||||
#
|
||||
# Note: see test "ph" for the newer and
|
||||
# simpler method to handle common misspellings,
|
||||
# for example, alot->a lot, inspite->in spite,
|
||||
# (that is giving the best suggestion, and limiting
|
||||
# ngram/phonetic suggestion)
|
||||
|
||||
# switch off ngram suggestion for testing
|
||||
MAXNGRAMSUGS 0
|
||||
REP 2
|
||||
REP alot a_lot
|
||||
REP inspite in_spite
|
||||
KEY qwertzuiop|asdfghjkl|yxcvbnm|aq
|
||||
WORDCHARS .-
|
||||
FORBIDDENWORD ?
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
13
|
||||
NASA
|
||||
Gandhi
|
||||
grateful
|
||||
permanent
|
||||
vacation
|
||||
a
|
||||
lot
|
||||
have
|
||||
which
|
||||
McDonald
|
||||
permanent-vacation/?
|
||||
in
|
||||
spite
|
||||
inspire
|
|
@ -0,0 +1,15 @@
|
|||
NASA
|
||||
Gandhi
|
||||
grateful
|
||||
permanent
|
||||
vacation
|
||||
a lot, lot
|
||||
in spite, inspire
|
||||
permanent. Vacation
|
||||
have
|
||||
which
|
||||
Gandhi
|
||||
McDonald
|
||||
permanent
|
||||
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
nasa
|
||||
Ghandi
|
||||
greatful
|
||||
permenant
|
||||
vacacation
|
||||
alot
|
||||
inspite
|
||||
permanent.Vacation
|
||||
ahev
|
||||
hwihc
|
||||
GAndhi
|
||||
Mcdonald
|
||||
permqnent
|
||||
permanent-vacation
|
||||
permqnent-vacation
|
|
@ -0,0 +1,25 @@
|
|||
# new suggestion methods of Hunspell 1.7:
|
||||
# dictionary word pairs with spaces or dashes
|
||||
# got top priority, and removes other not
|
||||
# "good" (uppercase, REP, ph:) suggestions:
|
||||
#
|
||||
# "alot" -> "a lot"
|
||||
#
|
||||
# Note: use ph: at the dictionary word pair
|
||||
# with space or dash to keep the other not
|
||||
# "good" suggestions, for example
|
||||
#
|
||||
# a lot ph:alot
|
||||
#
|
||||
# results "alot" -> "a lot", "alto", "slot"...
|
||||
|
||||
# switch off ngram suggestion for testing
|
||||
MAXNGRAMSUGS 0
|
||||
KEY qwertzuiop|asdfghjkl|yxcvbnm|aq
|
||||
|
||||
# Note: TRY must contain the letter "a" or "-" to enable
|
||||
# checking dictionary word pairs with dashes
|
||||
TRY esianrtolcdugmphbyfvkwz'
|
||||
WORDCHARS .-
|
||||
FORBIDDENWORD ?
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
11
|
||||
a
|
||||
lot
|
||||
a lot
|
||||
alto
|
||||
in
|
||||
spite
|
||||
in spite
|
||||
inspire
|
||||
scot
|
||||
free
|
||||
scot-free
|
|
@ -0,0 +1,3 @@
|
|||
a lot
|
||||
in spite
|
||||
scot-free
|
|
@ -0,0 +1,3 @@
|
|||
alot
|
||||
inspite
|
||||
scotfree
|
Loading…
Reference in New Issue