mirror of https://github.com/apache/lucene.git
LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)
This commit is contained in:
parent
061233ca4e
commit
653626399f
|
@ -168,6 +168,7 @@ public class Dictionary {
|
||||||
String[] neighborKeyGroups = new String[0];
|
String[] neighborKeyGroups = new String[0];
|
||||||
boolean enableSplitSuggestions = true;
|
boolean enableSplitSuggestions = true;
|
||||||
List<RepEntry> repTable = new ArrayList<>();
|
List<RepEntry> repTable = new ArrayList<>();
|
||||||
|
List<List<String>> mapTable = new ArrayList<>();
|
||||||
|
|
||||||
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
||||||
FST<CharsRef> iconv;
|
FST<CharsRef> iconv;
|
||||||
|
@ -399,6 +400,11 @@ public class Dictionary {
|
||||||
String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
|
String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
|
||||||
repTable.add(new RepEntry(parts[1], parts[2]));
|
repTable.add(new RepEntry(parts[1], parts[2]));
|
||||||
}
|
}
|
||||||
|
} else if ("MAP".equals(firstWord)) {
|
||||||
|
int count = parseNum(reader, line);
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
mapTable.add(parseMapEntry(reader, reader.readLine()));
|
||||||
|
}
|
||||||
} else if ("KEY".equals(firstWord)) {
|
} else if ("KEY".equals(firstWord)) {
|
||||||
neighborKeyGroups = singleArgument(reader, line).split("\\|");
|
neighborKeyGroups = singleArgument(reader, line).split("\\|");
|
||||||
} else if ("NOSPLITSUGS".equals(firstWord)) {
|
} else if ("NOSPLITSUGS".equals(firstWord)) {
|
||||||
|
@ -462,6 +468,25 @@ public class Dictionary {
|
||||||
stripOffsets[currentIndex] = currentOffset;
|
stripOffsets[currentIndex] = currentOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
|
||||||
|
String unparsed = singleArgument(reader, line);
|
||||||
|
List<String> mapEntry = new ArrayList<>();
|
||||||
|
for (int j = 0; j < unparsed.length(); j++) {
|
||||||
|
if (unparsed.charAt(j) == '(') {
|
||||||
|
int closing = unparsed.indexOf(')', j);
|
||||||
|
if (closing < 0) {
|
||||||
|
throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber());
|
||||||
|
}
|
||||||
|
|
||||||
|
mapEntry.add(unparsed.substring(j + 1, closing));
|
||||||
|
j = closing;
|
||||||
|
} else {
|
||||||
|
mapEntry.add(String.valueOf(unparsed.charAt(j)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return mapEntry;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean hasLanguage(String... langCodes) {
|
private boolean hasLanguage(String... langCodes) {
|
||||||
if (language == null) return false;
|
if (language == null) return false;
|
||||||
String langCode = extractLanguageCode(language);
|
String langCode = extractLanguageCode(language);
|
||||||
|
|
|
@ -74,6 +74,10 @@ class ModifyingSuggester {
|
||||||
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
|
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
|
||||||
hasGoodSuggestions |= tryRep(word);
|
hasGoodSuggestions |= tryRep(word);
|
||||||
|
|
||||||
|
if (!speller.dictionary.mapTable.isEmpty()) {
|
||||||
|
enumerateMapReplacements(word, "", 0);
|
||||||
|
}
|
||||||
|
|
||||||
trySwappingChars(word);
|
trySwappingChars(word);
|
||||||
tryLongSwap(word);
|
tryLongSwap(word);
|
||||||
tryNeighborKeys(word);
|
tryNeighborKeys(word);
|
||||||
|
@ -116,6 +120,27 @@ class ModifyingSuggester {
|
||||||
return result.size() > before;
|
return result.size() > before;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void enumerateMapReplacements(String word, String accumulated, int offset) {
|
||||||
|
if (offset == word.length()) {
|
||||||
|
trySuggestion(accumulated);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (List<String> entries : speller.dictionary.mapTable) {
|
||||||
|
for (String entry : entries) {
|
||||||
|
if (word.regionMatches(offset, entry, 0, entry.length())) {
|
||||||
|
for (String replacement : entries) {
|
||||||
|
if (!entry.equals(replacement)) {
|
||||||
|
enumerateMapReplacements(word, accumulated + replacement, offset + entry.length());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enumerateMapReplacements(word, accumulated + word.charAt(offset), offset + 1);
|
||||||
|
}
|
||||||
|
|
||||||
private boolean checkSimpleWord(String part) {
|
private boolean checkSimpleWord(String part) {
|
||||||
return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
|
return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
|
||||||
}
|
}
|
||||||
|
|
|
@ -180,6 +180,10 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("sug2");
|
doTest("sug2");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMapSuggestions() throws Exception {
|
||||||
|
doTest("map");
|
||||||
|
}
|
||||||
|
|
||||||
protected void doTest(String name) throws Exception {
|
protected void doTest(String name) throws Exception {
|
||||||
checkSpellCheckerExpectations(
|
checkSpellCheckerExpectations(
|
||||||
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
|
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
# With MAP suggestion, Hunspell can add missing accents to a word.
|
||||||
|
|
||||||
|
# switch off ngram suggestion for testing
|
||||||
|
MAXNGRAMSUGS 0
|
||||||
|
|
||||||
|
MAP 3
|
||||||
|
MAP uúü
|
||||||
|
MAP oóö
|
||||||
|
MAP ß(ss)
|
|
@ -0,0 +1,4 @@
|
||||||
|
3
|
||||||
|
Frühstück
|
||||||
|
tükörfúró
|
||||||
|
groß
|
|
@ -0,0 +1,3 @@
|
||||||
|
Frühstück
|
||||||
|
tükörfúró
|
||||||
|
groß
|
|
@ -0,0 +1,3 @@
|
||||||
|
Fruhstuck
|
||||||
|
tukorfuro
|
||||||
|
gross
|
Loading…
Reference in New Issue