LUCENE-9763: Hunspell: fix FORBIDDENWORD support (#2351)

Don't decompound if it's a simple word with a forbidden root, don't look up the word twice, and don't forbid stemming (be like Hunspell).
2021-02-11 15:16:40 +01:00 · 2021-02-11 15:16:40 +01:00 · 019872453d
parent 01e34f8723
commit 019872453d
21 changed files with 186 additions and 82 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -1407,14 +1407,6 @@ public class Dictionary {
        .collect(Collectors.toList());
  }

-  boolean isForbiddenWord(char[] word, int length) {
-    if (forbiddenword != FLAG_UNSET) {
-      IntsRef forms = lookupWord(word, 0, length);
-      return forms != null && hasFlag(forms, forbiddenword);
-    }
-    return false;
-  }
-
  boolean hasFlag(IntsRef forms, char flag) {
    int formStep = formStep();
    for (int i = 0; i < forms.length; i += formStep) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -51,22 +51,22 @@ class GeneratingSuggester {
  }

  List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
-    List<Weighted<DictEntry>> roots = findSimilarDictionaryEntries(word, originalCase);
+    List<Weighted<Root<String>>> roots = findSimilarDictionaryEntries(word, originalCase);
    List<Weighted<String>> expanded = expandRoots(word, roots);
    TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded);
    return getMostRelevantSuggestions(bySimilarity, prevSuggestions);
  }

-  private List<Weighted<DictEntry>> findSimilarDictionaryEntries(
+  private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
      String word, WordCase originalCase) {
-    TreeSet<Weighted<DictEntry>> roots = new TreeSet<>();
+    TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
    processFST(
        dictionary.words,
        (key, forms) -> {
          if (Math.abs(key.length - word.length()) > 4) return;

          String root = toString(key);
-          List<DictEntry> entries = filterSuitableEntries(root, forms);
+          List<Root<String>> entries = filterSuitableEntries(root, forms);
          if (entries.isEmpty()) return;

          if (originalCase == WordCase.LOWER
@ -106,8 +106,8 @@ class GeneratingSuggester {
    return new String(chars);
  }

-  private List<DictEntry> filterSuitableEntries(String word, IntsRef forms) {
-    List<DictEntry> result = new ArrayList<>();
+  private List<Root<String>> filterSuitableEntries(String word, IntsRef forms) {
+    List<Root<String>> result = new ArrayList<>();
    for (int i = 0; i < forms.length; i += dictionary.formStep()) {
      int entryId = forms.ints[forms.offset + i];
      if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
@ -116,17 +116,18 @@ class GeneratingSuggester {
          || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
        continue;
      }
-      result.add(new DictEntry(word, entryId));
+      result.add(new Root<>(word, entryId));
    }

    return result;
  }

-  private List<Weighted<String>> expandRoots(String misspelled, List<Weighted<DictEntry>> roots) {
+  private List<Weighted<String>> expandRoots(
+      String misspelled, List<Weighted<Root<String>>> roots) {
    int thresh = calcThreshold(misspelled);

    TreeSet<Weighted<String>> expanded = new TreeSet<>();
-    for (Weighted<DictEntry> weighted : roots) {
+    for (Weighted<Root<String>> weighted : roots) {
      for (String guess : expandRoot(weighted.word, misspelled)) {
        String lower = dictionary.toLowerCase(guess);
        int sc =
@ -156,7 +157,7 @@ class GeneratingSuggester {
    return thresh / 3 - 1;
  }

-  private List<String> expandRoot(DictEntry root, String misspelled) {
+  private List<String> expandRoot(Root<String> root, String misspelled) {
    List<String> crossProducts = new ArrayList<>();
    Set<String> result = new LinkedHashSet<>();

@ -226,7 +227,7 @@ class GeneratingSuggester {
    return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
  }

-  private boolean hasCompatibleFlags(DictEntry root, int affixId) {
+  private boolean hasCompatibleFlags(Root<?> root, int affixId) {
    if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
      return false;
    }
@ -434,37 +435,4 @@ class GeneratingSuggester {
      return cmp != 0 ? -cmp : word.compareTo(o.word);
    }
  }
-
-  private static class DictEntry implements Comparable<DictEntry> {
-    private final String word;
-    private final int entryId;
-
-    DictEntry(String word, int entryId) {
-      this.word = word;
-      this.entryId = entryId;
-    }
-
-    @Override
-    public String toString() {
-      return word;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-      if (this == o) return true;
-      if (!(o instanceof DictEntry)) return false;
-      DictEntry dictEntry = (DictEntry) o;
-      return entryId == dictEntry.entryId && word.equals(dictEntry.word);
-    }
-
-    @Override
-    public int hashCode() {
-      return Objects.hash(word, entryId);
-    }
-
-    @Override
-    public int compareTo(DictEntry o) {
-      return word.compareTo(o.word);
-    }
-  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java
@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Objects;
+
+class Root<T extends CharSequence> implements Comparable<Root<T>> { // a word plus its entry id
+  final T word; // text form; returned by toString() and the sole key used by compareTo
+  final int entryId; // participates in equals/hashCode only, not in ordering
+
+  Root(T word, int entryId) { // stores both arguments as given, without copying or null checks
+    this.word = word;
+    this.entryId = entryId;
+  }
+
+  @Override
+  public String toString() { // renders only the word; entryId is omitted
+    return word.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) { // equal iff same entryId and word.equals(other.word)
+    if (this == o) return true;
+    if (!(o instanceof Root)) return false;
+    @SuppressWarnings("unchecked")
+    Root<T> root = (Root<T>) o; // unchecked: erasure means the other Root's T is not verified
+    return entryId == root.entryId && word.equals(root.word); // relies on T's own equals()
+  }
+
+  @Override
+  public int hashCode() { // hashes the same two fields that equals() compares
+    return Objects.hash(word, entryId);
+  }
+
+  @Override
+  public int compareTo(Root<T> o) { // orders by word only
+    return CharSequence.compare(word, o.word); // roots differing only in entryId compare as 0
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -68,11 +68,12 @@ public class SpellChecker {
    }

    char[] wordChars = word.toCharArray();
-    if (dictionary.isForbiddenWord(wordChars, wordChars.length)) {
-      return false;
+    Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null);
+    if (simpleResult != null) {
+      return simpleResult;
    }

-    if (checkWord(wordChars, wordChars.length, null)) {
+    if (checkCompounds(wordChars, wordChars.length, null)) {
      return true;
    }

@ -105,12 +106,9 @@ public class SpellChecker {
  }

  Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
-    if (dictionary.isForbiddenWord(wordChars, length)) {
-      return false;
-    }
-
-    if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
-      return true;
+    Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
+    if (entry != null) {
+      return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
    }

    return null;
@ -122,6 +120,10 @@ public class SpellChecker {
      return simpleResult;
    }

+    return checkCompounds(wordChars, length, originalCase);
+  }
+
+  private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
    if (dictionary.compoundRules != null
        && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
      return true;
@ -134,9 +136,10 @@ public class SpellChecker {
    return false;
  }

-  private CharsRef findStem(
+  private Root<CharsRef> findStem(
      char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
-    CharsRef[] result = {null};
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    Root<CharsRef>[] result = new Root[1];
    stemmer.doStem(
        wordChars,
        offset,
@ -145,7 +148,7 @@ public class SpellChecker {
        context,
        (stem, formID, stemException) -> {
          if (acceptsStem(formID)) {
-            result[0] = stem;
+            result[0] = new Root<>(stem, formID);
          }
          return false;
        });
@ -164,13 +167,15 @@ public class SpellChecker {
      WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
      int breakOffset = word.offset + breakPos;
      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
-        CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
+        Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
        if (stem == null
            && dictionary.simplifiedTriple
            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
          stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
        }
-        if (stem != null && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
+        if (stem != null
+            && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
+            && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
          CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
          if (checkCompoundsAfter(originalCase, part)) {
            return true;
@ -193,7 +198,8 @@ public class SpellChecker {
      if (expanded != null) {
        WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
        int breakPos = pos + pattern.endLength();
-        CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        Root<CharsRef> stem =
+            findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
        if (stem != null) {
          CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
          if (checkCompoundsAfter(originalCase, part)) {
@ -210,10 +216,11 @@ public class SpellChecker {
    int breakPos = prev.length;
    int remainingLength = word.length - breakPos;
    int breakOffset = word.offset + breakPos;
-    CharsRef tailStem =
+    Root<CharsRef> tailStem =
        findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
    if (tailStem != null
-        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem))
+        && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
+        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
        && prev.mayCompound(tailStem, remainingLength, originalCase)) {
      return true;
@ -232,7 +239,7 @@ public class SpellChecker {
    return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
  }

-  private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
+  private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
    return cr1.toString().equalsIgnoreCase(cr2.toString());
  }

@ -243,11 +250,15 @@ public class SpellChecker {
    final CheckCompoundPattern enablingPattern;

    CompoundPart(
-        CompoundPart prev, CharsRef tail, int length, CharsRef stem, CheckCompoundPattern enabler) {
+        CompoundPart prev,
+        CharsRef tail,
+        int length,
+        Root<CharsRef> stem,
+        CheckCompoundPattern enabler) {
      this.prev = prev;
      this.tail = tail;
      this.length = length;
-      this.stem = stem;
+      this.stem = stem.word;
      index = prev == null ? 1 : prev.index + 1;
      enablingPattern = enabler;
    }
@ -257,12 +268,12 @@ public class SpellChecker {
      return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
    }

-    boolean mayCompound(CharsRef nextStem, int nextPartLength, WordCase originalCase) {
+    boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
      boolean patternsOk =
          enablingPattern != null
-              ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem)
+              ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
              : dictionary.checkCompoundPatterns.stream()
-                  .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem));
+                  .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
      if (!patternsOk) {
        return false;
      }
@ -498,7 +509,7 @@ public class SpellChecker {
        if (!spell(chunk)) {
          for (String chunkSug : suggest(chunk)) {
            String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
-            if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length())) {
+            if (spell(replaced)) {
              result.add(replaced);
            }
          }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;

 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@ -94,10 +93,6 @@ final class Stemmer {
      word = scratchBuffer;
    }

-    if (dictionary.isForbiddenWord(word, length)) {
-      return Collections.emptyList();
-    }
-
    List<CharsRef> list = new ArrayList<>();
    RootProcessor processor =
        (stem, formID, stemException) -> {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -172,6 +172,18 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("onlyincompound2");
  }

+  public void testForbiddenWord() throws Exception {
+    doTest("forbiddenword");
+  }
+
+  public void testForbiddenWord1() throws Exception {
+    doTest("opentaal_forbiddenword1");
+  }
+
+  public void testForbiddenWord2() throws Exception {
+    doTest("opentaal_forbiddenword2");
+  }
+
  public void testGermanCompounding() throws Exception {
    doTest("germancompounding");
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java
@ -27,6 +27,5 @@ public class TestDutchIJ extends StemmerTestBase {
  public void testStemming() {
    assertStemsTo("ijs", "ijs");
    assertStemsTo("IJs", "ijs");
-    assertStemsTo("Ijs");
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
@ -0,0 +1,11 @@
+# FORBIDDENWORD flag
+# The signed word and its suffixed forms are all forbidden,
+# except with root homonyms.
+# Useful for forbidding bad suffixed forms or compounds.
+
+
+FORBIDDENWORD X
+COMPOUNDFLAG Y
+
+SFX A Y 1
+SFX A 0 s .
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
@ -0,0 +1,11 @@
+10
+foo/S
+foo/YX
+bar/YS
+bars/X
+foos/X
+kg
+Kg/X
+KG/X
+cm
+Cm/X
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
@ -0,0 +1,3 @@
+foo
+bar
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
@ -0,0 +1,4 @@
+bars
+foos
+foobar
+barfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.aff
@ -0,0 +1,9 @@
+TRY r
+
+FORBIDDENWORD F
+COMPOUNDRULE 2
+COMPOUNDRULE WW
+COMPOUNDRULE WWW
+
+SFX S Y 1
+SFX S 0 s .
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.dic
@ -0,0 +1,5 @@
+4
+foo/W
+word/W
+bar/WS
+foowordbar/FS
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.good
@ -0,0 +1,3 @@
+fooword
+wordbar
+barwordfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.sug
@ -0,0 +1 @@
+barwordfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.wrong
@ -0,0 +1,5 @@
+foowordbar
+foowordbars
+foowordba
+foowordbas
+barwodfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.aff
@ -0,0 +1,7 @@
+TRY r
+
+FORBIDDENWORD F
+COMPOUNDFLAG W
+
+SFX S Y 1
+SFX S 0 s .
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.dic
@ -0,0 +1,5 @@
+3
+foo/WS
+word/W
+bar/WS
+foowordbar/FS
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.good
@ -0,0 +1,4 @@
+fooword
+wordbar
+barwordfoo
+barwordfoos
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.sug
@ -0,0 +1 @@
+barwordfoo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.wrong
@ -0,0 +1,5 @@
+foowordbar
+foowordbars
+foowordba
+foowordbas
+barwodfoo