LUCENE-9703: Hunspell: prohibit FORBIDDENWORD words and their case variations (#2254)

2021-01-29 08:36:37 +01:00 · 2021-01-29 08:36:37 +01:00 · 71705c900b
parent 4ba78f2ab2
commit 71705c900b
5 changed files with 17 additions and 3 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -57,6 +57,10 @@ public class SpellChecker {
    }
    char[] wordChars = word.toCharArray();
+    if (dictionary.isForbiddenWord(wordChars, wordChars.length, scratch)) {
+      return false;
+    }
    if (checkWord(wordChars, wordChars.length, false)) {
      return true;
    }
@@ -66,9 +70,7 @@ public class SpellChecker {
      return true;
    }
-    if (dictionary.breaks.isNotEmpty()
-        && !hasTooManyBreakOccurrences(word)
-        && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
+    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
      return tryBreaks(word);
    }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.ArrayUtil;
@@ -93,6 +94,10 @@ final class Stemmer {
      word = scratchBuffer;
    }
+    if (dictionary.isForbiddenWord(word, length, scratch)) {
+      return Collections.emptyList();
+    }
    WordCase wordCase = caseOf(word, length);
    List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
    if (wordCase == WordCase.UPPER) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/IJ.wrong
@@ -0,0 +1 @@
+Ijs
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("allcaps");
  }
+  @Test
+  public void IJ() throws Exception {
+    doTest("IJ");
+  }
  @Test
  public void i53643_numbersWithSeparators() throws Exception {
    doTest("i53643");
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java
@@ -27,5 +27,6 @@ public class TestDutchIJ extends StemmerTestBase {
  public void testStemming() {
    assertStemsTo("ijs", "ijs");
    assertStemsTo("IJs", "ijs");
+    assertStemsTo("Ijs");
  }
 }