mirror of https://github.com/apache/lucene.git
LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)
This commit is contained in:
parent
573b442903
commit
1852d7ad5a
|
@ -44,6 +44,8 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
@ -80,6 +82,7 @@ public class Dictionary {
|
|||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
||||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||
private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
|
||||
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
||||
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
|
||||
|
||||
|
@ -386,8 +389,7 @@ public class Dictionary {
|
|||
fullStrip = true;
|
||||
} else if ("LANG".equals(firstWord)) {
|
||||
language = singleArgument(reader, line);
|
||||
String langCode = extractLanguageCode(language);
|
||||
alternateCasing = langCode.equals("tr") || langCode.equals("az");
|
||||
this.alternateCasing = hasLanguage("tr", "az");
|
||||
} else if ("BREAK".equals(firstWord)) {
|
||||
breaks = parseBreaks(reader, line);
|
||||
} else if ("WORDCHARS".equals(firstWord)) {
|
||||
|
@ -463,6 +465,17 @@ public class Dictionary {
|
|||
stripOffsets[currentIndex] = currentOffset;
|
||||
}
|
||||
|
||||
private boolean hasLanguage(String... langCodes) {
|
||||
if (language == null) return false;
|
||||
String langCode = extractLanguageCode(language);
|
||||
for (String code : langCodes) {
|
||||
if (langCode.equals(code)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static String extractLanguageCode(String isoCode) {
|
||||
int underscore = isoCode.indexOf("_");
|
||||
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
|
||||
|
@ -910,7 +923,7 @@ public class Dictionary {
|
|||
if (!hasStemExceptions) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0 && morphStart < line.length()) {
|
||||
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
|
||||
hasStemExceptions = hasStemException(line.substring(morphStart + 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -963,6 +976,23 @@ public class Dictionary {
|
|||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
String toLowerCase(String word) {
|
||||
char[] chars = new char[word.length()];
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
chars[i] = caseFold(word.charAt(i));
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
String toTitleCase(String word) {
|
||||
char[] chars = new char[word.length()];
|
||||
chars[0] = Character.toUpperCase(word.charAt(0));
|
||||
for (int i = 1; i < word.length(); i++) {
|
||||
chars[i] = caseFold(word.charAt(i));
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private String sortWordsOffline(
|
||||
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
|
||||
OfflineSorter sorter =
|
||||
|
@ -1062,13 +1092,14 @@ public class Dictionary {
|
|||
}
|
||||
// we possibly have morphological data
|
||||
int stemExceptionID = 0;
|
||||
if (hasStemExceptions && end + 1 < line.length()) {
|
||||
String stemException = parseStemException(line.substring(end + 1));
|
||||
if (stemException != null) {
|
||||
stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
|
||||
stemExceptionID =
|
||||
stemExceptionCount + 1; // we use '0' to indicate no exception for the form
|
||||
stemExceptions[stemExceptionCount++] = stemException;
|
||||
if (end + 1 < line.length()) {
|
||||
String morphData = line.substring(end + 1);
|
||||
for (String datum : splitMorphData(morphData)) {
|
||||
if (datum.startsWith("st:")) {
|
||||
stemExceptionID = addStemException(datum.substring(3));
|
||||
} else if (datum.startsWith("ph:") && datum.length() > 3) {
|
||||
addPhoneticRepEntries(entry, datum.substring(3));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1088,6 +1119,52 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
private int addStemException(String stemException) {
|
||||
stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
|
||||
stemExceptions[stemExceptionCount++] = stemException;
|
||||
return stemExceptionCount; // we use '0' to indicate no exception for the form
|
||||
}
|
||||
|
||||
private void addPhoneticRepEntries(String word, String ph) {
|
||||
// e.g. "pretty ph:prity ph:priti->pretti" to suggest both prity->pretty and pritier->prettiest
|
||||
int arrow = ph.indexOf("->");
|
||||
String pattern;
|
||||
String replacement;
|
||||
if (arrow > 0) {
|
||||
pattern = ph.substring(0, arrow);
|
||||
replacement = ph.substring(arrow + 2);
|
||||
} else {
|
||||
pattern = ph;
|
||||
replacement = word;
|
||||
}
|
||||
|
||||
// when the ph: field ends with *, strip last character of pattern and replacement
|
||||
// e.g., "pretty ph:prity*" results in "prit->prett" replacement instead of "prity->pretty",
|
||||
// to get both prity->pretty and pritiest->prettiest suggestions.
|
||||
if (pattern.endsWith("*") && pattern.length() > 2 && replacement.length() > 1) {
|
||||
pattern = pattern.substring(0, pattern.length() - 2);
|
||||
replacement = replacement.substring(0, replacement.length() - 1);
|
||||
}
|
||||
|
||||
// capitalize lowercase pattern for capitalized words to support
|
||||
// good suggestions also for capitalized misspellings,
|
||||
// e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday.
|
||||
if (WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER) {
|
||||
// add also lowercase word in the case of German or
|
||||
// Hungarian to support lowercase suggestions lowercased by
|
||||
// compound word generation or derivational suffixes
|
||||
// for example by adjectival suffix "-i" of geographical names in Hungarian:
|
||||
// Massachusetts ph:messzecsuzec
|
||||
// messzecsuzeci -> massachusettsi (adjective)
|
||||
// For lowercasing by conditional PFX rules, see e.g. germancompounding test
|
||||
if (hasLanguage("de", "hu")) {
|
||||
repTable.add(new RepEntry(pattern, toLowerCase(replacement)));
|
||||
}
|
||||
repTable.add(new RepEntry(toTitleCase(pattern), replacement));
|
||||
}
|
||||
repTable.add(new RepEntry(pattern, replacement));
|
||||
}
|
||||
|
||||
boolean isDotICaseChangeDisallowed(char[] word) {
|
||||
return word[0] == 'İ' && !alternateCasing;
|
||||
}
|
||||
|
@ -1220,29 +1297,31 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
private String parseStemException(String morphData) {
|
||||
private boolean hasStemException(String morphData) {
|
||||
for (String datum : splitMorphData(morphData)) {
|
||||
if (datum.startsWith("st:")) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private List<String> splitMorphData(String morphData) {
|
||||
// first see if it's an alias
|
||||
if (morphAliasCount > 0) {
|
||||
try {
|
||||
int alias = Integer.parseInt(morphData.trim());
|
||||
morphData = morphAliases[alias - 1];
|
||||
} catch (NumberFormatException e) {
|
||||
// fine
|
||||
} catch (NumberFormatException ignored) {
|
||||
}
|
||||
}
|
||||
// try to parse morph entry
|
||||
int index = morphData.indexOf(" st:");
|
||||
if (index < 0) {
|
||||
index = morphData.indexOf("\tst:");
|
||||
if (morphData.isBlank()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
if (index >= 0) {
|
||||
int endIndex = indexOfSpaceOrTab(morphData, index + 1);
|
||||
if (endIndex < 0) {
|
||||
endIndex = morphData.length();
|
||||
}
|
||||
return morphData.substring(index + 4, endIndex);
|
||||
}
|
||||
return null;
|
||||
return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isBlank())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
|
||||
|
|
|
@ -16,9 +16,12 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
class ModifyingSuggester {
|
||||
private static final int MAX_CHAR_DISTANCE = 4;
|
||||
|
@ -36,6 +39,14 @@ class ModifyingSuggester {
|
|||
|
||||
WordCase wc = WordCase.caseOf(word);
|
||||
|
||||
if (wc == WordCase.UPPER) {
|
||||
tryVariationsOf(speller.dictionary.toLowerCase(word));
|
||||
tryVariationsOf(speller.dictionary.toTitleCase(word));
|
||||
return result.stream()
|
||||
.map(this::tryUpperCase)
|
||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
||||
}
|
||||
|
||||
if (wc == WordCase.MIXED) {
|
||||
int dot = word.indexOf('.');
|
||||
if (dot > 0
|
||||
|
@ -44,27 +55,24 @@ class ModifyingSuggester {
|
|||
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
|
||||
}
|
||||
|
||||
tryVariationsOf(toLowerCase(word));
|
||||
tryVariationsOf(speller.dictionary.toLowerCase(word));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private String toLowerCase(String word) {
|
||||
char[] chars = new char[word.length()];
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
chars[i] = speller.dictionary.caseFold(word.charAt(i));
|
||||
private String tryUpperCase(String candidate) {
|
||||
String upper = candidate.toUpperCase(Locale.ROOT);
|
||||
if (upper.contains(" ") || speller.spell(upper)) {
|
||||
return upper;
|
||||
}
|
||||
return new String(chars);
|
||||
String title = speller.dictionary.toTitleCase(candidate);
|
||||
return speller.spell(title) ? title : candidate;
|
||||
}
|
||||
|
||||
private void tryVariationsOf(String word) {
|
||||
trySuggestion(word.toUpperCase(Locale.ROOT));
|
||||
if (checkDictionaryForSplitSuggestions(word)) {
|
||||
return;
|
||||
}
|
||||
|
||||
tryRep(word);
|
||||
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
|
||||
hasGoodSuggestions |= tryRep(word);
|
||||
|
||||
trySwappingChars(word);
|
||||
tryLongSwap(word);
|
||||
|
@ -75,12 +83,24 @@ class ModifyingSuggester {
|
|||
tryReplacingChar(word);
|
||||
tryTwoDuplicateChars(word);
|
||||
|
||||
if (speller.dictionary.enableSplitSuggestions) {
|
||||
List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
|
||||
if (!goodSplit.isEmpty()) {
|
||||
List<String> copy = new ArrayList<>(result);
|
||||
result.clear();
|
||||
result.addAll(goodSplit);
|
||||
if (hasGoodSuggestions) {
|
||||
result.addAll(copy);
|
||||
}
|
||||
hasGoodSuggestions = true;
|
||||
}
|
||||
|
||||
if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
|
||||
trySplitting(word);
|
||||
}
|
||||
}
|
||||
|
||||
private void tryRep(String word) {
|
||||
private boolean tryRep(String word) {
|
||||
int before = result.size();
|
||||
for (RepEntry entry : speller.dictionary.repTable) {
|
||||
for (String candidate : entry.substitute(word)) {
|
||||
if (trySuggestion(candidate)) {
|
||||
|
@ -88,11 +108,16 @@ class ModifyingSuggester {
|
|||
}
|
||||
|
||||
if (candidate.contains(" ")
|
||||
&& Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
|
||||
&& Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
|
||||
result.add(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.size() > before;
|
||||
}
|
||||
|
||||
private boolean checkSimpleWord(String part) {
|
||||
return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
|
||||
}
|
||||
|
||||
private void trySwappingChars(String word) {
|
||||
|
@ -213,24 +238,30 @@ class ModifyingSuggester {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean checkDictionaryForSplitSuggestions(String word) {
|
||||
boolean found = false;
|
||||
private List<String> checkDictionaryForSplitSuggestions(String word) {
|
||||
List<String> result = new ArrayList<>();
|
||||
for (int i = 1; i < word.length() - 1; i++) {
|
||||
String w1 = word.substring(0, i);
|
||||
String w2 = word.substring(i);
|
||||
found |= trySuggestion(w1 + " " + w2);
|
||||
String spaced = w1 + " " + w2;
|
||||
if (speller.checkWord(spaced)) {
|
||||
result.add(spaced);
|
||||
}
|
||||
if (shouldSplitByDash()) {
|
||||
found |= trySuggestion(w1 + "-" + w2);
|
||||
String dashed = w1 + "-" + w2;
|
||||
if (speller.checkWord(dashed)) {
|
||||
result.add(dashed);
|
||||
}
|
||||
}
|
||||
}
|
||||
return found;
|
||||
return result;
|
||||
}
|
||||
|
||||
private void trySplitting(String word) {
|
||||
for (int i = 1; i < word.length() - 1; i++) {
|
||||
String w1 = word.substring(0, i);
|
||||
String w2 = word.substring(i);
|
||||
if (speller.checkWord(w1) && speller.checkWord(w2)) {
|
||||
if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
|
||||
result.add(w1 + " " + w2);
|
||||
if (shouldSplitByDash()) {
|
||||
result.add(w1 + "-" + w2);
|
||||
|
@ -244,10 +275,6 @@ class ModifyingSuggester {
|
|||
}
|
||||
|
||||
private boolean trySuggestion(String candidate) {
|
||||
if (speller.checkWord(candidate)) {
|
||||
result.add(candidate);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return speller.checkWord(candidate) && result.add(candidate);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -134,7 +134,7 @@ public class SpellChecker {
|
|||
return checkWord(word.toCharArray(), word.length(), null);
|
||||
}
|
||||
|
||||
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
|
||||
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
|
||||
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -143,6 +143,15 @@ public class SpellChecker {
|
|||
return true;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
|
||||
Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
|
||||
if (simpleResult != null) {
|
||||
return simpleResult;
|
||||
}
|
||||
|
||||
if (dictionary.compoundRules != null
|
||||
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
|
||||
return true;
|
||||
|
|
|
@ -44,10 +44,18 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("allcaps");
|
||||
}
|
||||
|
||||
public void rep() throws Exception {
|
||||
public void testRepSuggestions() throws Exception {
|
||||
doTest("rep");
|
||||
}
|
||||
|
||||
// Runs the doTest harness on the "ph" fixture set.
// NOTE(review): presumably the ph.* resources with dictionary-level "ph:" fields - confirm.
public void testPhSuggestions() throws Exception {
|
||||
doTest("ph");
|
||||
}
|
||||
|
||||
// Runs the doTest harness on the "ph2" fixture set.
// NOTE(review): presumably the ph2.* resources (ph: with compounds, affixes, '*' and "->") - confirm.
public void testPhSuggestions2() throws Exception {
|
||||
doTest("ph2");
|
||||
}
|
||||
|
||||
// Runs the doTest harness on the "forceucase" fixture set.
public void testForceUCase() throws Exception {
|
||||
doTest("forceucase");
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
# new suggestion methods of Hunspell 1.7:
|
||||
# ph: for dictionary-based suggestions.
|
||||
#
|
||||
# For example, suggestions for "wich"
|
||||
# with this test dictionary:
|
||||
#
|
||||
# Hunspell 1.3.3
|
||||
# wich
|
||||
# & wich 4 0: winch, witch, which, wish
|
||||
#
|
||||
# Hunspell 1.6.2
|
||||
# wich
|
||||
# & wich 4 0: which, witch, winch, wish
|
||||
#
|
||||
# Suggestions will be limited for
|
||||
# the dictionary words with the same ph: field,
|
||||
# and for non-ngram suggestions.
|
||||
#
|
||||
# Order of the ph: suggestions for the
|
||||
# same misspelling, e.g. wich -> which, witch
|
||||
# follows the order of the words in the dictionary:
|
||||
#
|
||||
# which ph:wich
|
||||
# witch ph:wich
|
||||
#
|
||||
# switch off ngram suggestions to check only
|
||||
# ph: based suggestions
|
||||
MAXNGRAMSUGS 0
|
||||
|
||||
TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'-
|
|
@ -0,0 +1,11 @@
|
|||
8
|
||||
a lot ph:alot
|
||||
in spite ph:inspite
|
||||
inspire
|
||||
what ph:whta ph:waht
|
||||
Wednesday ph:wendsay ph:wensday
|
||||
which ph:wich
|
||||
witch ph:wich
|
||||
winch
|
||||
wish
|
||||
Oh, my gosh! ph:omg
|
|
@ -0,0 +1,11 @@
|
|||
a lot
|
||||
in spite, inspire
|
||||
what
|
||||
what
|
||||
Wednesday
|
||||
Wednesday
|
||||
Wednesday
|
||||
Wednesday
|
||||
which, witch, winch, wish
|
||||
Oh, my gosh!
|
||||
OH, MY GOSH!
|
|
@ -0,0 +1,11 @@
|
|||
alot
|
||||
inspite
|
||||
whta
|
||||
waht
|
||||
wensday
|
||||
wendsay
|
||||
Wensday
|
||||
Wendsay
|
||||
wich
|
||||
omg
|
||||
OMG
|
|
@ -0,0 +1,32 @@
|
|||
# switch off ngram suggestion to test
|
||||
# usage of ph: dictionary fields in REP
|
||||
# suggestions
|
||||
SET UTF-8
|
||||
|
||||
MAXNGRAMSUGS 0
|
||||
|
||||
# test in compounds, too
|
||||
COMPOUNDFLAG Y
|
||||
|
||||
# test also dictionary items with space,
|
||||
# and forbidden compounding, if there is
|
||||
# a ph: field with that compound as
|
||||
# misspelling in the dictionary
|
||||
CHECKCOMPOUNDREP
|
||||
|
||||
# test in compound word with affixes
|
||||
SFX A Y 1
|
||||
SFX A 0 's .
|
||||
|
||||
# when the ph: field ends with the character *,
|
||||
# strip last character of the pattern and the replacement
|
||||
# to match in REP suggestions also at character changes,
|
||||
# for example, "pretty ph:prity*" results in the "prit->prett"
|
||||
# REP replacement instead of "prity->pretty", to get
|
||||
# prity->pretty and pritiest->prettiest suggestions.
|
||||
|
||||
SFX B Y 2
|
||||
SFX B y iest [^aeiou]y
|
||||
SFX B ö őt ö
|
||||
|
||||
WORDCHARS '
|
|
@ -0,0 +1,11 @@
|
|||
9
|
||||
foo ph:bar ph:baz
|
||||
foo bar ph:foobar
|
||||
word/Y ph:baz
|
||||
stem/Y ph: ph:
|
||||
forbidden/Y
|
||||
root/YA
|
||||
forbidden root/A ph:forbiddenroot
|
||||
pretty/B ph:prity*
|
||||
foobarö/B ph:fubarő*
|
||||
happy/B ph:hepy ph:hepi->happi
|
|
@ -0,0 +1,9 @@
|
|||
foo
|
||||
word
|
||||
stem
|
||||
wordstem
|
||||
stemword
|
||||
rootforbidden
|
||||
root's
|
||||
foobarö
|
||||
foobarőt
|
|
@ -0,0 +1,14 @@
|
|||
foo
|
||||
foo, word
|
||||
foo bar
|
||||
wordstem
|
||||
stemword
|
||||
stemwordstem
|
||||
forbidden root
|
||||
forbidden root's
|
||||
pretty
|
||||
prettiest
|
||||
foobarö
|
||||
foobarőt
|
||||
happy
|
||||
happiest
|
|
@ -0,0 +1,15 @@
|
|||
bar
|
||||
baz
|
||||
foobar
|
||||
bazstem
|
||||
stembaz
|
||||
stembazstem
|
||||
forbiddenroot
|
||||
forbiddenroot's
|
||||
rootforbiddenroot
|
||||
prity
|
||||
pritiest
|
||||
fubarö
|
||||
fubarőt
|
||||
hepy
|
||||
hepiest
|
|
@ -10,7 +10,7 @@ REP shun$ tion
|
|||
REP ^alot$ a_lot # add the highest priority for "a lot" suggestion to "alot"
|
||||
REP ^foo$ bar
|
||||
REP ' _ # "un'alunno" -> "un alunno"
|
||||
REP ^vinte<EFBFBD>n$ vinte_e_un
|
||||
REP ^vinteún$ vinte_e_un
|
||||
REP s 's
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue