From 71705c900b6002ef36d1448f64a78526c158bf70 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 29 Jan 2021 08:36:37 +0100 Subject: [PATCH] LUCENE-9703: Hunspell: prohibit FORBIDDENWORD words and their case variations (#2254) --- .../org/apache/lucene/analysis/hunspell/SpellChecker.java | 8 +++++--- .../java/org/apache/lucene/analysis/hunspell/Stemmer.java | 5 +++++ .../src/test/org/apache/lucene/analysis/hunspell/IJ.wrong | 1 + .../apache/lucene/analysis/hunspell/SpellCheckerTest.java | 5 +++++ .../org/apache/lucene/analysis/hunspell/TestDutchIJ.java | 1 + 5 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index e32a805be14..8570e3d89fc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -57,6 +57,10 @@ public class SpellChecker { } char[] wordChars = word.toCharArray(); + if (dictionary.isForbiddenWord(wordChars, wordChars.length, scratch)) { + return false; + } + if (checkWord(wordChars, wordChars.length, false)) { return true; } @@ -66,9 +70,7 @@ public class SpellChecker { return true; } - if (dictionary.breaks.isNotEmpty() - && !hasTooManyBreakOccurrences(word) - && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) { + if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { return tryBreaks(word); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 572473c2ab7..d88ee408f35 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.hunspell; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.util.ArrayUtil; @@ -93,6 +94,10 @@ final class Stemmer { word = scratchBuffer; } + if (dictionary.isForbiddenWord(word, length, scratch)) { + return Collections.emptyList(); + } + WordCase wordCase = caseOf(word, length); List list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD); if (wordCase == WordCase.UPPER) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong new file mode 100644 index 00000000000..54bbb475a07 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong @@ -0,0 +1 @@ +Ijs diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index dbfbbec08c8..0baf32f2901 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("allcaps"); } + @Test + public void IJ() throws Exception { + doTest("IJ"); + } + @Test public void i53643_numbersWithSeparators() throws Exception { doTest("i53643"); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java index dc4b897dae3..58477d84f12 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java @@ -27,5 +27,6 @@ public class TestDutchIJ extends StemmerTestBase { public void testStemming() { assertStemsTo("ijs", "ijs"); assertStemsTo("IJs", "ijs"); + assertStemsTo("Ijs"); } }