LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)

This commit is contained in:
Peter Gromov 2021-02-06 17:04:12 +01:00 committed by GitHub
parent 573b442903
commit 1852d7ad5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 321 additions and 54 deletions

View File

@ -44,6 +44,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -80,6 +82,7 @@ public class Dictionary {
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
@ -386,8 +389,7 @@ public class Dictionary {
fullStrip = true;
} else if ("LANG".equals(firstWord)) {
language = singleArgument(reader, line);
String langCode = extractLanguageCode(language);
alternateCasing = langCode.equals("tr") || langCode.equals("az");
this.alternateCasing = hasLanguage("tr", "az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("WORDCHARS".equals(firstWord)) {
@ -463,6 +465,17 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
/**
 * Tells whether the dictionary's declared language is one of the given language codes.
 *
 * <p>Only the part of {@code language} before the first underscore (as returned by
 * {@link #extractLanguageCode}) is compared, so region suffixes are ignored.
 *
 * @param langCodes language codes to test against
 * @return {@code true} if a language has been set and its code equals one of {@code langCodes};
 *     {@code false} if no language has been set or none of the codes match
 */
private boolean hasLanguage(String... langCodes) {
  if (language == null) {
    return false;
  }
  String dictionaryLangCode = extractLanguageCode(language);
  return Arrays.asList(langCodes).contains(dictionaryLangCode);
}
static String extractLanguageCode(String isoCode) {
int underscore = isoCode.indexOf("_");
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
@ -910,7 +923,7 @@ public class Dictionary {
if (!hasStemExceptions) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
hasStemExceptions = hasStemException(line.substring(morphStart + 1));
}
}
@ -963,6 +976,23 @@ public class Dictionary {
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
/**
 * Returns a copy of {@code word} in which every char has been passed through this dictionary's
 * {@code caseFold}.
 *
 * @param word the text to convert
 * @return a new string of the same length with each char case-folded
 */
String toLowerCase(String word) {
  char[] folded = word.toCharArray();
  for (int pos = 0; pos < folded.length; pos++) {
    folded[pos] = caseFold(folded[pos]);
  }
  return new String(folded);
}
/**
 * Returns {@code word} with its first char upper-cased via {@link Character#toUpperCase(char)}
 * and every following char passed through this dictionary's {@code caseFold}.
 *
 * @param word the text to convert; may be empty
 * @return the title-cased text, or {@code word} itself when it is empty
 */
String toTitleCase(String word) {
  // An empty word has no first char to capitalize; indexing position 0 would throw.
  if (word.isEmpty()) {
    return word;
  }
  char[] chars = word.toCharArray();
  chars[0] = Character.toUpperCase(chars[0]);
  for (int i = 1; i < chars.length; i++) {
    chars[i] = caseFold(chars[i]);
  }
  return new String(chars);
}
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
@ -1062,13 +1092,14 @@ public class Dictionary {
}
// we possibly have morphological data
int stemExceptionID = 0;
if (hasStemExceptions && end + 1 < line.length()) {
String stemException = parseStemException(line.substring(end + 1));
if (stemException != null) {
stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
stemExceptionID =
stemExceptionCount + 1; // we use '0' to indicate no exception for the form
stemExceptions[stemExceptionCount++] = stemException;
if (end + 1 < line.length()) {
String morphData = line.substring(end + 1);
for (String datum : splitMorphData(morphData)) {
if (datum.startsWith("st:")) {
stemExceptionID = addStemException(datum.substring(3));
} else if (datum.startsWith("ph:") && datum.length() > 3) {
addPhoneticRepEntries(entry, datum.substring(3));
}
}
}
@ -1088,6 +1119,52 @@ public class Dictionary {
}
}
/**
 * Appends {@code stemException} to the {@code stemExceptions} array, growing the array first.
 *
 * @param stemException the stem exception text to store
 * @return the 1-based id of the stored exception (its array index plus one); '0' is reserved to
 *     indicate no exception for the form
 */
private int addStemException(String stemException) {
  int slot = stemExceptionCount;
  stemExceptions = ArrayUtil.grow(stemExceptions, slot + 1);
  stemExceptions[slot] = stemException;
  stemExceptionCount = slot + 1;
  return stemExceptionCount;
}
/**
 * Turns the value of one "ph:" morphological field into entries of {@code repTable}.
 *
 * <p>Supported forms of {@code ph}:
 *
 * <ul>
 *   <li>{@code prity} - suggest the dictionary word itself for this misspelling;
 *   <li>{@code priti->pretti} - explicit pattern and replacement, e.g. "pretty ph:prity
 *       ph:priti->pretti" suggests both prity->pretty and pritier->prettiest;
 *   <li>{@code prity*} - drop the asterisk and the last character of both pattern and
 *       replacement, giving "prit->prett", so that prity->pretty and pritiest->prettiest are
 *       both suggested.
 * </ul>
 *
 * @param word the dictionary word the field belongs to
 * @param ph the field value, without the "ph:" prefix
 */
private void addPhoneticRepEntries(String word, String ph) {
  int arrowPos = ph.indexOf("->");
  boolean hasExplicitReplacement = arrowPos > 0;
  String pattern = hasExplicitReplacement ? ph.substring(0, arrowPos) : ph;
  String replacement = hasExplicitReplacement ? ph.substring(arrowPos + 2) : word;

  // Trailing '*': remove it together with the pattern's last real character, and shorten the
  // replacement by one character as well.
  boolean strippable = pattern.endsWith("*") && pattern.length() > 2 && replacement.length() > 1;
  if (strippable) {
    pattern = pattern.substring(0, pattern.length() - 2);
    replacement = replacement.substring(0, replacement.length() - 1);
  }

  // For capitalized words with a lowercase pattern, also register the capitalized pattern so
  // that capitalized misspellings get good suggestions,
  // e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday.
  boolean capitalizedWordWithLowerPattern =
      WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER;
  if (capitalizedWordWithLowerPattern) {
    // German and Hungarian additionally get a lowercase replacement, to support suggestions
    // lowercased by compound word generation or derivational suffixes,
    // for example by adjectival suffix "-i" of geographical names in Hungarian:
    // Massachusetts ph:messzecsuzec
    // messzecsuzeci -> massachusettsi (adjective)
    // For lowercasing by conditional PFX rules, see e.g. germancompounding test
    if (hasLanguage("de", "hu")) {
      repTable.add(new RepEntry(pattern, toLowerCase(replacement)));
    }
    repTable.add(new RepEntry(toTitleCase(pattern), replacement));
  }

  // The plain pattern -> replacement entry is always added, after any case variants.
  repTable.add(new RepEntry(pattern, replacement));
}
/**
 * Tells whether case conversion must be avoided for a word starting with the dotted capital
 * 'İ'.
 *
 * @param word the word's chars; the first element is inspected
 * @return {@code true} if the word starts with 'İ' and {@code alternateCasing} is off
 */
boolean isDotICaseChangeDisallowed(char[] word) {
  boolean startsWithDottedCapitalI = word[0] == 'İ';
  return startsWithDottedCapitalI && !alternateCasing;
}
@ -1220,29 +1297,31 @@ public class Dictionary {
}
}
private String parseStemException(String morphData) {
/**
 * Tells whether the given morphological data contains a stem exception, i.e. at least one field
 * produced by {@code splitMorphData} that starts with "st:".
 *
 * @param morphData the morphological part of a dictionary line
 * @return {@code true} if an "st:" field is present
 */
private boolean hasStemException(String morphData) {
  return splitMorphData(morphData).stream().anyMatch(field -> field.startsWith("st:"));
}
private List<String> splitMorphData(String morphData) {
// first see if it's an alias
if (morphAliasCount > 0) {
try {
int alias = Integer.parseInt(morphData.trim());
morphData = morphAliases[alias - 1];
} catch (NumberFormatException e) {
// fine
} catch (NumberFormatException ignored) {
}
}
// try to parse morph entry
int index = morphData.indexOf(" st:");
if (index < 0) {
index = morphData.indexOf("\tst:");
if (morphData.isBlank()) {
return Collections.emptyList();
}
if (index >= 0) {
int endIndex = indexOfSpaceOrTab(morphData, index + 1);
if (endIndex < 0) {
endIndex = morphData.length();
}
return morphData.substring(index + 4, endIndex);
}
return null;
return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
.map(String::trim)
.filter(s -> !s.isBlank())
.collect(Collectors.toList());
}
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {

View File

@ -16,9 +16,12 @@
*/
package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4;
@ -36,6 +39,14 @@ class ModifyingSuggester {
WordCase wc = WordCase.caseOf(word);
if (wc == WordCase.UPPER) {
tryVariationsOf(speller.dictionary.toLowerCase(word));
tryVariationsOf(speller.dictionary.toTitleCase(word));
return result.stream()
.map(this::tryUpperCase)
.collect(Collectors.toCollection(LinkedHashSet::new));
}
if (wc == WordCase.MIXED) {
int dot = word.indexOf('.');
if (dot > 0
@ -44,27 +55,24 @@ class ModifyingSuggester {
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
}
tryVariationsOf(toLowerCase(word));
tryVariationsOf(speller.dictionary.toLowerCase(word));
}
return result;
}
private String toLowerCase(String word) {
char[] chars = new char[word.length()];
for (int i = 0; i < word.length(); i++) {
chars[i] = speller.dictionary.caseFold(word.charAt(i));
private String tryUpperCase(String candidate) {
String upper = candidate.toUpperCase(Locale.ROOT);
if (upper.contains(" ") || speller.spell(upper)) {
return upper;
}
return new String(chars);
String title = speller.dictionary.toTitleCase(candidate);
return speller.spell(title) ? title : candidate;
}
private void tryVariationsOf(String word) {
trySuggestion(word.toUpperCase(Locale.ROOT));
if (checkDictionaryForSplitSuggestions(word)) {
return;
}
tryRep(word);
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);
trySwappingChars(word);
tryLongSwap(word);
@ -75,12 +83,24 @@ class ModifyingSuggester {
tryReplacingChar(word);
tryTwoDuplicateChars(word);
if (speller.dictionary.enableSplitSuggestions) {
List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
if (!goodSplit.isEmpty()) {
List<String> copy = new ArrayList<>(result);
result.clear();
result.addAll(goodSplit);
if (hasGoodSuggestions) {
result.addAll(copy);
}
hasGoodSuggestions = true;
}
if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
trySplitting(word);
}
}
private void tryRep(String word) {
private boolean tryRep(String word) {
int before = result.size();
for (RepEntry entry : speller.dictionary.repTable) {
for (String candidate : entry.substitute(word)) {
if (trySuggestion(candidate)) {
@ -88,11 +108,16 @@ class ModifyingSuggester {
}
if (candidate.contains(" ")
&& Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
&& Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
result.add(candidate);
}
}
}
return result.size() > before;
}
/**
 * Checks {@code part} with the speller's {@code checkSimpleWord}, treating an undecided
 * ({@code null}) answer as "not accepted".
 *
 * @param part the word to check
 * @return {@code true} only if the speller's simple check returned {@code Boolean.TRUE}
 */
private boolean checkSimpleWord(String part) {
  char[] chars = part.toCharArray();
  Boolean verdict = speller.checkSimpleWord(chars, chars.length, null);
  return verdict != null && verdict;
}
private void trySwappingChars(String word) {
@ -213,24 +238,30 @@ class ModifyingSuggester {
}
}
private boolean checkDictionaryForSplitSuggestions(String word) {
boolean found = false;
private List<String> checkDictionaryForSplitSuggestions(String word) {
List<String> result = new ArrayList<>();
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
found |= trySuggestion(w1 + " " + w2);
String spaced = w1 + " " + w2;
if (speller.checkWord(spaced)) {
result.add(spaced);
}
if (shouldSplitByDash()) {
found |= trySuggestion(w1 + "-" + w2);
String dashed = w1 + "-" + w2;
if (speller.checkWord(dashed)) {
result.add(dashed);
}
}
}
return found;
return result;
}
private void trySplitting(String word) {
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
if (speller.checkWord(w1) && speller.checkWord(w2)) {
if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
result.add(w1 + " " + w2);
if (shouldSplitByDash()) {
result.add(w1 + "-" + w2);
@ -244,10 +275,6 @@ class ModifyingSuggester {
}
private boolean trySuggestion(String candidate) {
if (speller.checkWord(candidate)) {
result.add(candidate);
return true;
}
return false;
return speller.checkWord(candidate) && result.add(candidate);
}
}

View File

@ -134,7 +134,7 @@ public class SpellChecker {
return checkWord(word.toCharArray(), word.length(), null);
}
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
}
@ -143,6 +143,15 @@ public class SpellChecker {
return true;
}
return null;
}
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
if (simpleResult != null) {
return simpleResult;
}
if (dictionary.compoundRules != null
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true;

View File

@ -44,10 +44,18 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
public void rep() throws Exception {
/**
 * Runs the shared {@code doTest} helper on the "rep" fixture.
 * NOTE(review): presumably covers REP-table based suggestions - confirm against the fixture.
 */
public void testRepSuggestions() throws Exception {
doTest("rep");
}
/**
 * Runs the shared {@code doTest} helper on the "ph" fixture.
 * NOTE(review): presumably covers suggestions derived from "ph:" dictionary fields - confirm
 * against the fixture.
 */
public void testPhSuggestions() throws Exception {
doTest("ph");
}
/**
 * Runs the shared {@code doTest} helper on the "ph2" fixture.
 * NOTE(review): presumably covers "ph:" fields combined with compounds, affixes and the
 * trailing-'*' form - confirm against the fixture.
 */
public void testPhSuggestions2() throws Exception {
doTest("ph2");
}
/** Runs the shared {@code doTest} helper on the "forceucase" fixture. */
public void testForceUCase() throws Exception {
doTest("forceucase");
}

View File

@ -0,0 +1,30 @@
# new suggestion methods of Hunspell 1.7:
# ph: for dictionary-based suggestions.
#
# For example, suggestions for "wich"
# with this test dictionary:
#
# Hunspell 1.3.3
# wich
# & wich 4 0: winch, witch, which, wish
#
# Hunspell 1.6.2
# wich
# & wich 4 0: which, witch, winch, wish
#
# Suggestions will be limited for
# the dictionary words with the same ph: field,
# and for non-ngram suggestions.
#
# Order of the ph: suggestions for the
# same misspelling, e.g. wich -> which, witch
# follows the order of the words in the dictionary:
#
# which ph:wich
# witch ph:wich
#
# switch off ngram suggestions to check only
# ph: based suggestions
MAXNGRAMSUGS 0
TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'-

View File

@ -0,0 +1,11 @@
8
a lot ph:alot
in spite ph:inspite
inspire
what ph:whta ph:waht
Wednesday ph:wendsay ph:wensday
which ph:wich
witch ph:wich
winch
wish
Oh, my gosh! ph:omg

View File

@ -0,0 +1,11 @@
a lot
in spite, inspire
what
what
Wednesday
Wednesday
Wednesday
Wednesday
which, witch, winch, wish
Oh, my gosh!
OH, MY GOSH!

View File

@ -0,0 +1,11 @@
alot
inspite
whta
waht
wensday
wendsay
Wensday
Wendsay
wich
omg
OMG

View File

@ -0,0 +1,32 @@
# switch off ngram suggestion to test
# usage of ph: dictionary fields in REP
# suggestions
SET UTF-8
MAXNGRAMSUGS 0
# test in compounds, too
COMPOUNDFLAG Y
# test also dictionary items with space,
# and forbidden compounding, if there is
# a ph: field with that compound as
# misspelling in the dictionary
CHECKCOMPOUNDREP
# test in compound word with affixes
SFX A Y 1
SFX A 0 's .
# when the ph: field ends with the character *,
# strip last character of the pattern and the replacement
# to match in REP suggestions also at character changes,
# for example, "pretty ph:prity*" results in "prit->prett"
# REP replacement instead of "prity->pretty", to get
# prity->pretty and pritiest->prettiest suggestions.
SFX B Y 2
SFX B y iest [^aeiou]y
SFX B ö őt ö
WORDCHARS '

View File

@ -0,0 +1,11 @@
9
foo ph:bar ph:baz
foo bar ph:foobar
word/Y ph:baz
stem/Y ph: ph:
forbidden/Y
root/YA
forbidden root/A ph:forbiddenroot
pretty/B ph:prity*
foobarö/B ph:fubarő*
happy/B ph:hepy ph:hepi->happi

View File

@ -0,0 +1,9 @@
foo
word
stem
wordstem
stemword
rootforbidden
root's
foobarö
foobarőt

View File

@ -0,0 +1,14 @@
foo
foo, word
foo bar
wordstem
stemword
stemwordstem
forbidden root
forbidden root's
pretty
prettiest
foobarö
foobarőt
happy
happiest

View File

@ -0,0 +1,15 @@
bar
baz
foobar
bazstem
stembaz
stembazstem
forbiddenroot
forbiddenroot's
rootforbiddenroot
prity
pritiest
fubarö
fubarőt
hepy
hepiest

View File

@ -10,7 +10,7 @@ REP shun$ tion
REP ^alot$ a_lot # add the highest priority for "a lot" suggestion to "alot"
REP ^foo$ bar
REP ' _ # "un'alunno" -> "un alunno"
REP ^vinte<EFBFBD>n$ vinte_e_un
REP ^vinteún$ vinte_e_un
REP s 's