LUCENE-9703: Hunspell: prohibit FORBIDDENWORD words and their case variations (#2254)

This commit is contained in:
Peter Gromov 2021-01-29 08:36:37 +01:00 committed by GitHub
parent 4ba78f2ab2
commit 71705c900b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 17 additions and 3 deletions

View File

@@ -57,6 +57,10 @@ public class SpellChecker {
} }
char[] wordChars = word.toCharArray(); char[] wordChars = word.toCharArray();
if (dictionary.isForbiddenWord(wordChars, wordChars.length, scratch)) {
return false;
}
if (checkWord(wordChars, wordChars.length, false)) { if (checkWord(wordChars, wordChars.length, false)) {
return true; return true;
} }
@@ -66,9 +70,7 @@ public class SpellChecker {
return true; return true;
} }
if (dictionary.breaks.isNotEmpty() if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
&& !hasTooManyBreakOccurrences(word)
&& !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
return tryBreaks(word); return tryBreaks(word);
} }

View File

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
@@ -93,6 +94,10 @@ final class Stemmer {
word = scratchBuffer; word = scratchBuffer;
} }
if (dictionary.isForbiddenWord(word, length, scratch)) {
return Collections.emptyList();
}
WordCase wordCase = caseOf(word, length); WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD); List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER) {

View File

@@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps"); doTest("allcaps");
} }
@Test
public void IJ() throws Exception {
doTest("IJ");
}
@Test @Test
public void i53643_numbersWithSeparators() throws Exception { public void i53643_numbersWithSeparators() throws Exception {
doTest("i53643"); doTest("i53643");

View File

@@ -27,5 +27,6 @@ public class TestDutchIJ extends StemmerTestBase {
public void testStemming() { public void testStemming() {
assertStemsTo("ijs", "ijs"); assertStemsTo("ijs", "ijs");
assertStemsTo("IJs", "ijs"); assertStemsTo("IJs", "ijs");
assertStemsTo("Ijs");
} }
} }