LUCENE-9703: Hunspell: prohibit FORBIDDENWORD words and their case variations (#2254)

This commit is contained in:
Peter Gromov 2021-01-29 08:36:37 +01:00 committed by GitHub
parent 4ba78f2ab2
commit 71705c900b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 17 additions and 3 deletions

View File

@ -57,6 +57,10 @@ public class SpellChecker {
}
char[] wordChars = word.toCharArray();
if (dictionary.isForbiddenWord(wordChars, wordChars.length, scratch)) {
return false;
}
if (checkWord(wordChars, wordChars.length, false)) {
return true;
}
@ -66,9 +70,7 @@ public class SpellChecker {
return true;
}
if (dictionary.breaks.isNotEmpty()
&& !hasTooManyBreakOccurrences(word)
&& !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
return tryBreaks(word);
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
@ -93,6 +94,10 @@ final class Stemmer {
word = scratchBuffer;
}
if (dictionary.isForbiddenWord(word, length, scratch)) {
return Collections.emptyList();
}
WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
if (wordCase == WordCase.UPPER) {

View File

@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
/**
 * Runs the shared {@code doTest} check for the test case named "IJ".
 *
 * <p>NOTE(review): {@code doTest} is defined outside this view; presumably it loads the
 * dictionary fixtures named "IJ" and verifies the expected good/wrong words — confirm
 * against its definition.
 */
@Test
public void IJ() throws Exception {
doTest("IJ");
}
@Test
public void i53643_numbersWithSeparators() throws Exception {
doTest("i53643");

View File

@ -27,5 +27,6 @@ public class TestDutchIJ extends StemmerTestBase {
public void testStemming() {
// NOTE(review): assertStemsTo is defined outside this view; presumably the first argument
// is the input word and the remaining arguments are the expected stems — confirm.
// All-lower-case input: expected stem is "ijs".
assertStemsTo("ijs", "ijs");
// Input with both leading letters capitalised ("IJs"): expected stem is the lower-case "ijs".
assertStemsTo("IJs", "ijs");
// No expected stems are passed for "Ijs" (only the first letter capitalised);
// presumably this asserts that the word yields no stems at all — confirm.
assertStemsTo("Ijs");
}
}