mirror of https://github.com/apache/lucene.git
LUCENE-9758: Hunspell: support NOSUGGEST option (#2340)
This commit is contained in:
parent
76b55509dd
commit
7f9b1f991b
|
@ -175,6 +175,7 @@ public class Dictionary {
|
|||
int maxDiff = 5;
|
||||
int maxNGramSuggestions = Integer.MAX_VALUE;
|
||||
boolean onlyMaxDiff;
|
||||
char noSuggest, subStandard;
|
||||
|
||||
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
||||
FST<CharsRef> iconv;
|
||||
|
@ -431,6 +432,10 @@ public class Dictionary {
|
|||
onlyMaxDiff = true;
|
||||
} else if ("FORBIDDENWORD".equals(firstWord)) {
|
||||
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("NOSUGGEST".equals(firstWord)) {
|
||||
noSuggest = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("SUBSTANDARD".equals(firstWord)) {
|
||||
subStandard = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIN".equals(firstWord)) {
|
||||
compoundMin = Math.max(1, parseNum(reader, line));
|
||||
} else if ("COMPOUNDWORDMAX".equals(firstWord)) {
|
||||
|
|
|
@ -111,6 +111,7 @@ class GeneratingSuggester {
|
|||
for (int i = 0; i < forms.length; i += dictionary.formStep()) {
|
||||
int entryId = forms.ints[forms.offset + i];
|
||||
if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
|
||||
|| dictionary.hasFlag(entryId, dictionary.noSuggest)
|
||||
|| dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG)
|
||||
|| dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
|
||||
continue;
|
||||
|
|
|
@ -143,13 +143,19 @@ public class SpellChecker {
|
|||
length,
|
||||
originalCase,
|
||||
context,
|
||||
(stem, forms, formID) -> {
|
||||
(stem, formID, stemException) -> {
|
||||
if (acceptsStem(formID)) {
|
||||
result[0] = stem;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
return result[0];
|
||||
}
|
||||
|
||||
/**
 * Tells whether the dictionary entry with the given id may be accepted as a stem.
 *
 * <p>This base implementation accepts every entry; it exists as an overridable hook so that a
 * subclass can filter entries out.
 *
 * @param formID internal id of the dictionary entry being considered
 * @return always {@code true} in this implementation
 */
boolean acceptsStem(int formID) {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
|
||||
if (prev != null && prev.index > dictionary.compoundMax - 2) return false;
|
||||
|
||||
|
@ -424,12 +430,20 @@ public class SpellChecker {
|
|||
}
|
||||
|
||||
WordCase wordCase = WordCase.caseOf(word);
|
||||
ModifyingSuggester modifier = new ModifyingSuggester(this);
|
||||
SpellChecker suggestionSpeller =
|
||||
new SpellChecker(dictionary) {
|
||||
// Stem filter for the speller handed to the suggesters: an entry is accepted only when it
// carries neither the dictionary's noSuggest flag nor its subStandard flag.
@Override
|
||||
boolean acceptsStem(int formID) {
|
||||
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
||||
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
||||
}
|
||||
};
|
||||
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
|
||||
Set<String> suggestions = modifier.suggest(word, wordCase);
|
||||
|
||||
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||
suggestions.addAll(
|
||||
new GeneratingSuggester(this)
|
||||
new GeneratingSuggester(suggestionSpeller)
|
||||
.suggest(dictionary.toLowerCase(word), wordCase, suggestions));
|
||||
}
|
||||
|
||||
|
|
|
@ -100,8 +100,8 @@ final class Stemmer {
|
|||
|
||||
List<CharsRef> list = new ArrayList<>();
|
||||
RootProcessor processor =
|
||||
(stem, forms, formID) -> {
|
||||
list.add(newStem(stem, forms, formID));
|
||||
(stem, formID, stemException) -> {
|
||||
list.add(newStem(stem, stemException));
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@ -273,7 +273,7 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
|
||||
if (!callProcessor(word, offset, length, processor, forms, i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -344,23 +344,27 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
interface RootProcessor {
|
||||
/** @return whether the processing should be continued */
|
||||
boolean processRoot(CharsRef stem, IntsRef forms, int formID);
|
||||
/**
|
||||
* @param stem the text of the found dictionary entry
|
||||
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
|
||||
* Dictionary#hasFlag(int, char)}
|
||||
* @param stemException "st:" morphological data if present, {@code null} otherwise
|
||||
* @return whether the processing should be continued
|
||||
*/
|
||||
boolean processRoot(CharsRef stem, int formID, String stemException);
|
||||
}
|
||||
|
||||
private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
|
||||
final String exception;
|
||||
private String stemException(IntsRef forms, int formIndex) {
|
||||
if (dictionary.hasStemExceptions) {
|
||||
int exceptionID = forms.ints[forms.offset + formID + 1];
|
||||
int exceptionID = forms.ints[forms.offset + formIndex + 1];
|
||||
if (exceptionID > 0) {
|
||||
exception = dictionary.getStemException(exceptionID);
|
||||
} else {
|
||||
exception = null;
|
||||
return dictionary.getStemException(exceptionID);
|
||||
}
|
||||
} else {
|
||||
exception = null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private CharsRef newStem(CharsRef stem, String exception) {
|
||||
if (dictionary.needsOutputCleaning) {
|
||||
scratchSegment.setLength(0);
|
||||
if (exception != null) {
|
||||
|
@ -704,7 +708,7 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
|
||||
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -757,6 +761,12 @@ final class Stemmer {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Hands one found root to the given processor.
 *
 * <p>Wraps the {@code word[offset, offset + length)} slice into a {@link CharsRef}, reads the
 * entry id stored at position {@code i} of {@code forms}, looks up the stem exception for that
 * same position via {@code stemException(forms, i)}, and forwards all three to {@link
 * RootProcessor#processRoot}.
 *
 * @param word buffer holding the stem text
 * @param offset start of the stem within {@code word}
 * @param length number of chars of the stem
 * @param processor callback receiving the stem, the entry id and the stem exception
 * @param forms the forms data to read the entry id (and stem exception) from
 * @param i index into {@code forms}, relative to {@code forms.offset}
 * @return the value returned by the processor
 */
private boolean callProcessor(
|
||||
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
||||
CharsRef stem = new CharsRef(word, offset, length);
|
||||
return processor.processRoot(stem, forms.ints[forms.offset + i], stemException(forms, i));
|
||||
}
|
||||
|
||||
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
|
||||
char circumfix = dictionary.circumfix;
|
||||
// if circumfix was previously set by a prefix, we must check this suffix,
|
||||
|
@ -765,7 +775,6 @@ final class Stemmer {
|
|||
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
|
||||
return !isSuffix
|
||||
|| previousAffix < 0
|
||||
|
|
|
@ -196,6 +196,10 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("map");
|
||||
}
|
||||
|
||||
/** Runs the shared {@code doTest} check against the test resources named "nosuggest". */
public void testNoSuggest() throws Exception {
|
||||
doTest("nosuggest");
|
||||
}
|
||||
|
||||
protected void doTest(String name) throws Exception {
|
||||
checkSpellCheckerExpectations(
|
||||
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
# Don't suggest words with the NOSUGGEST flag (for example, vulgar or obscene words)
|
||||
# See OpenOffice.org Issue #55498
|
||||
# (nosuggest.sug is an empty file)
|
||||
NOSUGGEST A
|
||||
COMPOUNDFLAG B
|
|
@ -0,0 +1,3 @@
|
|||
1
|
||||
foo/AB
|
||||
bar/B
|
|
@ -0,0 +1,3 @@
|
|||
foo
|
||||
foobar
|
||||
barfoo
|
|
@ -0,0 +1,3 @@
|
|||
foox
|
||||
foobarx
|
||||
barfoox
|
Loading…
Reference in New Issue