LUCENE-9758: Hunspell: support NOSUGGEST option (#2340)

2021-02-11 09:15:37 +01:00 · 2021-02-11 09:15:37 +01:00 · 7f9b1f991b
parent 76b55509dd
commit 7f9b1f991b
10 changed files with 66 additions and 19 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -175,6 +175,7 @@ public class Dictionary {
  int maxDiff = 5;
  int maxNGramSuggestions = Integer.MAX_VALUE;
  boolean onlyMaxDiff;
+  char noSuggest, subStandard;

  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
  FST<CharsRef> iconv;
@ -431,6 +432,10 @@ public class Dictionary {
        onlyMaxDiff = true;
      } else if ("FORBIDDENWORD".equals(firstWord)) {
        forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("NOSUGGEST".equals(firstWord)) {
+        noSuggest = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("SUBSTANDARD".equals(firstWord)) {
+        subStandard = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIN".equals(firstWord)) {
        compoundMin = Math.max(1, parseNum(reader, line));
      } else if ("COMPOUNDWORDMAX".equals(firstWord)) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -111,6 +111,7 @@ class GeneratingSuggester {
    for (int i = 0; i < forms.length; i += dictionary.formStep()) {
      int entryId = forms.ints[forms.offset + i];
      if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
+          || dictionary.hasFlag(entryId, dictionary.noSuggest)
          || dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG)
          || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
        continue;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -143,13 +143,19 @@ public class SpellChecker {
        length,
        originalCase,
        context,
-        (stem, forms, formID) -> {
+        (stem, formID, stemException) -> {
+          if (acceptsStem(formID)) {
            result[0] = stem;
+          }
          return false;
        });
    return result[0];
  }

+  /**
+   * Tells whether the dictionary entry with the given id may be used as the stem of a checked
+   * word. This base implementation accepts every entry; the method is not final so that a
+   * subclass can filter entries, e.g. by their flags.
+   *
+   * @param formID internal id of the dictionary entry
+   * @return {@code true} if the entry is acceptable, {@code false} if it should be ignored
+   */
+  boolean acceptsStem(int formID) {
+    // NOTE(review): the root processor that calls this hook stops after the first found entry
+    // whether or not it was accepted, so a rejected entry may hide a later acceptable homonym.
+    // Confirm this is intended.
+    return true;
+  }
+
  private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
    if (prev != null && prev.index > dictionary.compoundMax - 2) return false;

@ -424,12 +430,20 @@ public class SpellChecker {
    }

    WordCase wordCase = WordCase.caseOf(word);
-    ModifyingSuggester modifier = new ModifyingSuggester(this);
+    SpellChecker suggestionSpeller =
+        new SpellChecker(dictionary) {
+          @Override
+          boolean acceptsStem(int formID) {
+            return !dictionary.hasFlag(formID, dictionary.noSuggest)
+                && !dictionary.hasFlag(formID, dictionary.subStandard);
+          }
+        };
+    ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
    Set<String> suggestions = modifier.suggest(word, wordCase);

    if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
      suggestions.addAll(
-          new GeneratingSuggester(this)
+          new GeneratingSuggester(suggestionSpeller)
              .suggest(dictionary.toLowerCase(word), wordCase, suggestions));
    }

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -100,8 +100,8 @@ final class Stemmer {

    List<CharsRef> list = new ArrayList<>();
    RootProcessor processor =
-        (stem, forms, formID) -> {
-          list.add(newStem(stem, forms, formID));
+        (stem, formID, stemException) -> {
+          list.add(newStem(stem, stemException));
          return true;
        };

@ -273,7 +273,7 @@ final class Stemmer {
            continue;
          }
        }
-        if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
+        if (!callProcessor(word, offset, length, processor, forms, i)) {
          return false;
        }
      }
@ -344,23 +344,27 @@ final class Stemmer {
  }

  interface RootProcessor {
-    /** @return whether the processing should be continued */
-    boolean processRoot(CharsRef stem, IntsRef forms, int formID);
+    /**
+     * @param stem the text of the found dictionary entry
+     * @param formID internal id of the dictionary entry, e.g. to be used in {@link
+     *     Dictionary#hasFlag(int, char)}
+     * @param stemException "st:" morphological data if present, {@code null} otherwise
+     * @return whether the processing should be continued
+     */
+    boolean processRoot(CharsRef stem, int formID, String stemException);
  }

-  private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
-    final String exception;
+  private String stemException(IntsRef forms, int formIndex) {
    if (dictionary.hasStemExceptions) {
-      int exceptionID = forms.ints[forms.offset + formID + 1];
+      int exceptionID = forms.ints[forms.offset + formIndex + 1];
      if (exceptionID > 0) {
-        exception = dictionary.getStemException(exceptionID);
-      } else {
-        exception = null;
+        return dictionary.getStemException(exceptionID);
      }
-    } else {
-      exception = null;
+    }
+    return null;
  }

+  private CharsRef newStem(CharsRef stem, String exception) {
    if (dictionary.needsOutputCleaning) {
      scratchSegment.setLength(0);
      if (exception != null) {
@ -704,7 +708,7 @@ final class Stemmer {
              continue;
            }
          }
-          if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
+          if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
            return false;
          }
        }
@ -757,6 +761,12 @@ final class Stemmer {
    return true;
  }

+  /**
+   * Wraps the {@code length} chars of {@code word} starting at {@code offset} into a {@link
+   * CharsRef} and hands it to the processor, together with the entry id stored at position {@code
+   * i} of {@code forms} and the stem exception looked up for that same position.
+   *
+   * @param i index into {@code forms} (relative to {@code forms.offset}), not an entry id
+   * @return the processor's result, i.e. whether the processing should be continued
+   */
+  private boolean callProcessor(
+      char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
+    CharsRef stem = new CharsRef(word, offset, length);
+    return processor.processRoot(stem, forms.ints[forms.offset + i], stemException(forms, i));
+  }
+
  private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
    char circumfix = dictionary.circumfix;
    // if circumfix was previously set by a prefix, we must check this suffix,
@ -765,7 +775,6 @@ final class Stemmer {
        && isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
      return true;
    }
-
    if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
      return !isSuffix
          || previousAffix < 0
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -196,6 +196,10 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("map");
  }

+  public void testNoSuggest() throws Exception {
+    doTest("nosuggest");
+  }
+
  protected void doTest(String name) throws Exception {
    checkSpellCheckerExpectations(
        Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.aff
@ -0,0 +1,5 @@
+# don't suggest words with the NOSUGGEST flag (for example vulgar or obscene words)
+# See OpenOffice.org Issue #55498
+# (nosuggest.sug is an empty file)
+NOSUGGEST A
+COMPOUNDFLAG B
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.dic
@ -0,0 +1,3 @@
+1
+foo/AB
+bar/B
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.good
@ -0,0 +1,3 @@
+foo
+foobar
+barfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.sug
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/nosuggest.wrong
@ -0,0 +1,3 @@
+foox
+foobarx
+barfoox