LUCENE-9758: Hunspell: support NOSUGGEST option (#2340)

This commit is contained in:
Peter Gromov 2021-02-11 09:15:37 +01:00 committed by GitHub
parent 76b55509dd
commit 7f9b1f991b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 66 additions and 19 deletions

View File

@ -175,6 +175,7 @@ public class Dictionary {
int maxDiff = 5;
int maxNGramSuggestions = Integer.MAX_VALUE;
boolean onlyMaxDiff;
char noSuggest, subStandard;
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
@ -431,6 +432,10 @@ public class Dictionary {
onlyMaxDiff = true;
} else if ("FORBIDDENWORD".equals(firstWord)) {
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("NOSUGGEST".equals(firstWord)) {
noSuggest = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("SUBSTANDARD".equals(firstWord)) {
subStandard = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIN".equals(firstWord)) {
compoundMin = Math.max(1, parseNum(reader, line));
} else if ("COMPOUNDWORDMAX".equals(firstWord)) {

View File

@ -111,6 +111,7 @@ class GeneratingSuggester {
for (int i = 0; i < forms.length; i += dictionary.formStep()) {
int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
|| dictionary.hasFlag(entryId, dictionary.noSuggest)
|| dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG)
|| dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
continue;

View File

@ -143,13 +143,19 @@ public class SpellChecker {
length,
originalCase,
context,
(stem, forms, formID) -> {
result[0] = stem;
(stem, formID, stemException) -> {
if (acceptsStem(formID)) {
result[0] = stem;
}
return false;
});
return result[0];
}
/**
 * Hook consulted by the stem-lookup callback before a found stem is recorded as the result. This
 * base implementation accepts every entry; subclasses may override it to filter entries out.
 *
 * @param formID id of the dictionary entry, as received by the root-processing callback
 * @return {@code true} if the entry may be used as the found stem
 */
boolean acceptsStem(int formID) {
return true;
}
private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
if (prev != null && prev.index > dictionary.compoundMax - 2) return false;
@ -424,12 +430,20 @@ public class SpellChecker {
}
WordCase wordCase = WordCase.caseOf(word);
ModifyingSuggester modifier = new ModifyingSuggester(this);
SpellChecker suggestionSpeller =
new SpellChecker(dictionary) {
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}
};
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
Set<String> suggestions = modifier.suggest(word, wordCase);
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll(
new GeneratingSuggester(this)
new GeneratingSuggester(suggestionSpeller)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions));
}

View File

@ -100,8 +100,8 @@ final class Stemmer {
List<CharsRef> list = new ArrayList<>();
RootProcessor processor =
(stem, forms, formID) -> {
list.add(newStem(stem, forms, formID));
(stem, formID, stemException) -> {
list.add(newStem(stem, stemException));
return true;
};
@ -273,7 +273,7 @@ final class Stemmer {
continue;
}
}
if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
if (!callProcessor(word, offset, length, processor, forms, i)) {
return false;
}
}
@ -344,23 +344,27 @@ final class Stemmer {
}
interface RootProcessor {
/** @return whether the processing should be continued */
boolean processRoot(CharsRef stem, IntsRef forms, int formID);
/**
* @param stem the text of the found dictionary entry
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
* Dictionary#hasFlag(int, char)}
* @param stemException "st:" morphological data if present, {@code null} otherwise
* @return whether the processing should be continued
*/
boolean processRoot(CharsRef stem, int formID, String stemException);
}
private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
final String exception;
private String stemException(IntsRef forms, int formIndex) {
if (dictionary.hasStemExceptions) {
int exceptionID = forms.ints[forms.offset + formID + 1];
int exceptionID = forms.ints[forms.offset + formIndex + 1];
if (exceptionID > 0) {
exception = dictionary.getStemException(exceptionID);
} else {
exception = null;
return dictionary.getStemException(exceptionID);
}
} else {
exception = null;
}
return null;
}
private CharsRef newStem(CharsRef stem, String exception) {
if (dictionary.needsOutputCleaning) {
scratchSegment.setLength(0);
if (exception != null) {
@ -704,7 +708,7 @@ final class Stemmer {
continue;
}
}
if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false;
}
}
@ -757,6 +761,12 @@ final class Stemmer {
return true;
}
/**
 * Hands one found dictionary entry over to the given processor.
 *
 * @param word buffer holding the stem characters
 * @param offset start of the stem inside {@code word}
 * @param length number of stem characters
 * @param processor callback receiving the stem, the entry id and the stem exception
 * @param forms the entry data; the entry id is read at {@code forms.offset + i}
 * @param i index of the entry within {@code forms}
 * @return the processor's answer, i.e. whether the processing should be continued
 */
private boolean callProcessor(
    char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
  // Keep the original evaluation order: stem first, then entry id, then stem exception.
  CharsRef root = new CharsRef(word, offset, length);
  int entryId = forms.ints[forms.offset + i];
  String exception = stemException(forms, i);
  return processor.processRoot(root, entryId, exception);
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if circumfix was previously set by a prefix, we must check this suffix,
@ -765,7 +775,6 @@ final class Stemmer {
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
return true;
}
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0

View File

@ -196,6 +196,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("map");
}
/**
 * Runs the shared spell-checker expectations against the {@code nosuggest} test resources
 * (located through {@code nosuggest.aff} by {@code doTest}).
 */
public void testNoSuggest() throws Exception {
doTest("nosuggest");
}
protected void doTest(String name) throws Exception {
checkSpellCheckerExpectations(
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));

View File

@ -0,0 +1,5 @@
# don't suggest words with the NOSUGGEST flag (for example vulgar or obscene words)
# See OpenOffice.org Issue #55498
# (nosuggest.sug is an empty file)
NOSUGGEST A
COMPOUNDFLAG B

View File

@ -0,0 +1,3 @@
1
foo/AB
bar/B

View File

@ -0,0 +1,3 @@
foo
foobar
barfoo

View File

@ -0,0 +1,3 @@
foox
foobarx
barfoox