LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207)

2021-01-20 10:57:27 +01:00 · 2021-01-20 10:57:27 +01:00 · 939699f550
parent a233ed2fd1
commit 939699f550
18 changed files with 344 additions and 3 deletions
--- a/gradle/validation/rat-sources.gradle
+++ b/gradle/validation/rat-sources.gradle
@ -54,6 +54,8 @@ configure(project(":lucene:analysis:common")) {
        srcExcludes += [
            "**/*.aff",
            "**/*.dic",
            "**/*.wrong",
            "**/*.good",
            "**/charfilter/*.htm*",
            "**/*LuceneResourcesWikiPage.html"
        ]
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -86,8 +86,8 @@ API Changes
 Improvements
-* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
+* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
-  (Peter Gromov)
+  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -34,13 +34,16 @@ import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@ -87,6 +90,8 @@ public class Dictionary {
  private static final String OCONV_KEY = "OCONV";
  private static final String FULLSTRIP_KEY = "FULLSTRIP";
  private static final String LANG_KEY = "LANG";
  // affix-file keyword introducing the word-break table (count header and each pattern line)
  private static final String BREAK_KEY = "BREAK";
  // affix-file keyword declaring the flag stored in the forbiddenword field
  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@ -103,6 +108,7 @@ public class Dictionary {
  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;
  // word break patterns consulted by the spell checker; stays at Breaks.DEFAULT unless the affix
  // file contains a BREAK directive
  Breaks breaks = Breaks.DEFAULT;
  // all condition checks used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
@ -155,6 +161,7 @@ public class Dictionary {
  int circumfix = -1; // circumfix flag, or -1 if one is not defined
  int keepcase = -1; // keepcase flag, or -1 if one is not defined
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
  // ignored characters (dictionary, affix, inputs)
@ -256,6 +263,10 @@ public class Dictionary {
    }
  }
  /**
   * Returns the stride to use when walking the entries of a looked-up forms array: 2 if {@code
   * hasStemExceptions} is set, 1 otherwise.
   */
  int formStep() {
    if (hasStemExceptions) {
      return 2;
    }
    return 1;
  }
  /** Looks up Hunspell word forms from the dictionary */
  IntsRef lookupWord(char[] word, int offset, int length) {
    return lookup(words, word, offset, length);
@ -400,6 +411,14 @@ public class Dictionary {
      } else if (line.startsWith(LANG_KEY)) {
        language = line.substring(LANG_KEY.length()).trim();
        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
      } else if (line.startsWith(BREAK_KEY)) {
        breaks = parseBreaks(reader, line);
      } else if (line.startsWith(FORBIDDENWORD_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
        }
        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
      }
    }
@ -423,6 +442,30 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }
  /**
   * Parses a {@code BREAK} table from the affix file.
   *
   * <p>{@code line} is the table header ({@code BREAK <count>}); the next {@code count} lines are
   * read from {@code reader} and each must have the form {@code BREAK <pattern>}. A pattern that
   * starts with {@code ^} is stored (without the {@code ^}) as a word-start break, a pattern that
   * ends with {@code $} is stored (without the {@code $}) as a word-end break, and any other
   * pattern is stored as a middle break.
   *
   * @param reader the affix file reader, positioned right after the header line
   * @param line the {@code BREAK <count>} header line
   * @return the collected patterns, in declaration order and without duplicates
   * @throws IOException if reading from {@code reader} fails
   * @throws ParseException if the count is not a number, the file ends before all announced
   *     entries were read, or an entry is malformed
   */
  private Breaks parseBreaks(LineNumberReader reader, String line)
      throws IOException, ParseException {
    String countText = line.substring(BREAK_KEY.length()).trim();
    int num;
    try {
      num = Integer.parseInt(countText);
    } catch (NumberFormatException e) {
      // report a malformed header like any other affix syntax error, keeping the cause
      ParseException failure =
          new ParseException("Illegal BREAK count: " + countText, reader.getLineNumber());
      failure.initCause(e);
      throw failure;
    }

    Set<String> starting = new LinkedHashSet<>();
    Set<String> ending = new LinkedHashSet<>();
    Set<String> middle = new LinkedHashSet<>();
    for (int i = 0; i < num; i++) {
      line = reader.readLine();
      // readLine() returns null at end of stream: the table is shorter than its header claims
      if (line == null) {
        throw new ParseException(
            "BREAK chars expected, but the file ended", reader.getLineNumber());
      }
      String[] parts = line.split("\\s+");
      if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
        throw new ParseException("BREAK chars expected", reader.getLineNumber());
      }
      String breakStr = parts[1];
      if (breakStr.startsWith("^")) {
        starting.add(breakStr.substring(1));
      } else if (breakStr.endsWith("$")) {
        ending.add(breakStr.substring(0, breakStr.length() - 1));
      } else {
        middle.add(breakStr);
      }
    }
    return new Breaks(starting, ending, middle);
  }
  private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
@ -1143,6 +1186,22 @@ public class Dictionary {
    return null;
  }
  /**
   * Checks whether the dictionary entry matching the whole of {@code word} has a form carrying the
   * FORBIDDENWORD flag.
   *
   * @param word the characters to look up; the entire array is used
   * @param scratch reusable buffer handed to the flag lookup for each inspected form
   * @return {@code true} if some form of the entry has the forbidden-word flag; {@code false} if
   *     no such flag is defined, the word is not found, or no form carries the flag
   */
  boolean isForbiddenWord(char[] word, BytesRef scratch) {
    if (forbiddenword == -1) {
      return false;
    }
    IntsRef entries = lookupWord(word, 0, word.length);
    if (entries == null) {
      return false;
    }
    int stride = formStep();
    char forbiddenFlag = (char) forbiddenword;
    int index = 0;
    while (index < entries.length) {
      flagLookup.get(entries.ints[entries.offset + index], scratch);
      char[] flags = Dictionary.decodeFlags(scratch);
      if (hasFlag(flags, forbiddenFlag)) {
        return true;
      }
      index += stride;
    }
    return false;
  }
  /** Abstraction of the process of parsing flags taken from the affix and dic files */
  abstract static class FlagParsingStrategy {
@ -1371,4 +1430,21 @@ public class Dictionary {
    return DEFAULT_TEMP_DIR;
  }
  /**
   * The word break patterns declared by BREAK directives, grouped by where they may occur: at the
   * start of a word, at its end, or in the middle.
   */
  static class Breaks {
    private static final Set<String> MINUS = Collections.singleton("-");

    /** Break set with a single {@code "-"} pattern in each of the three groups. */
    static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS);

    final String[] starting;
    final String[] ending;
    final String[] middle;

    /** Copies each collection into its own array, keeping the collection's iteration order. */
    Breaks(Collection<String> starting, Collection<String> ending, Collection<String> middle) {
      this.starting = asArray(starting);
      this.ending = asArray(ending);
      this.middle = asArray(middle);
    }

    /** @return whether at least one pattern is present in any of the three groups */
    boolean isNotEmpty() {
      return !(middle.length == 0 && starting.length == 0 && ending.length == 0);
    }

    private static String[] asArray(Collection<String> patterns) {
      return patterns.toArray(new String[0]);
    }
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -0,0 +1,104 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import org.apache.lucene.util.BytesRef;
 /**
 * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
 * (but a single underlying Dictionary can be shared by multiple spell-checkers in different
 * threads). Not all Hunspell features are supported yet.
 */
 public class SpellChecker {
  /** Upper bound on middle-break occurrences; words reaching it are not split at all. */
  private static final int MAX_BREAK_OCCURRENCES = 10;

  private final Dictionary dictionary;
  private final BytesRef scratch = new BytesRef();
  private final Stemmer stemmer;

  /**
   * Creates a spell checker backed by the given dictionary.
   *
   * @param dictionary the Hunspell dictionary to check words against
   */
  public SpellChecker(Dictionary dictionary) {
    this.dictionary = dictionary;
    stemmer = new Stemmer(dictionary);
  }

  /**
   * Checks a single word: a word flagged as forbidden is rejected, a word with at least one stem
   * is accepted, and otherwise the word is accepted only if it can be split at the dictionary's
   * break patterns into parts that are themselves accepted.
   *
   * @param word the word to check
   * @return whether the given word's spelling is considered correct according to Hunspell rules
   */
  public boolean spell(String word) {
    char[] wordChars = word.toCharArray();
    if (dictionary.isForbiddenWord(wordChars, scratch)) {
      return false;
    }

    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
      return true;
    }

    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
      return tryBreaks(word);
    }

    return false;
  }

  /**
   * Tries to accept {@code word} by stripping a leading or trailing break pattern, or by splitting
   * it at the first or second occurrence of a middle break pattern, re-checking the remaining
   * parts with {@link #spell(String)}.
   */
  private boolean tryBreaks(String word) {
    for (String br : dictionary.breaks.starting) {
      // an empty pattern (e.g. from "BREAK ^") strips nothing, so spell() would be called again
      // with the very same word and recurse without end
      if (br.isEmpty()) {
        continue;
      }
      if (word.length() > br.length() && word.startsWith(br)) {
        if (spell(word.substring(br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.ending) {
      // same reasoning as above for an empty pattern coming from "BREAK $"
      if (br.isEmpty()) {
        continue;
      }
      if (word.length() > br.length() && word.endsWith(br)) {
        if (spell(word.substring(0, word.length() - br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.middle) {
      int pos = word.indexOf(br);
      if (canBeBrokenAt(word, br, pos)) {
        return true;
      }

      // try to break at the second occurrence
      // to recognize dictionary words with a word break
      if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Counts occurrences of all middle break patterns in {@code word} and reports whether the total
   * reaches {@link #MAX_BREAK_OCCURRENCES}; callers use this to skip the recursive break search.
   */
  private boolean hasTooManyBreakOccurrences(String word) {
    int occurrences = 0;
    for (String br : dictionary.breaks.middle) {
      int pos = 0;
      while ((pos = word.indexOf(br, pos)) >= 0) {
        if (++occurrences >= MAX_BREAK_OCCURRENCES) {
          return true;
        }
        pos += br.length();
      }
    }
    return false;
  }

  /**
   * Tells whether {@code breakStr} found at {@code breakPos} lies strictly inside {@code word}
   * (non-empty text on both sides) and both sides are spelled correctly on their own.
   */
  private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
    return breakPos > 0
        && breakPos < word.length() - breakStr.length()
        && spell(word.substring(0, breakPos))
        && spell(word.substring(breakPos + breakStr.length()));
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -64,7 +64,7 @@ final class Stemmer {
        suffixReaders[level] = dictionary.suffixes.getBytesReader();
      }
    }
-    formStep = dictionary.hasStemExceptions ? 2 : 1;
+    formStep = dictionary.formStep();
  }
  /**
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -0,0 +1,71 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.io.InputStream;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Objects;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.util.IOUtils;
 /** Runs the spell checker against the Hunspell BREAK test resources. */
 public class SpellCheckerTest extends StemmerTestBase {

  public void testBreak() throws Exception {
    doTest("break");
  }

  public void testBreakDefault() throws Exception {
    doTest("breakdefault");
  }

  public void testBreakOff() throws Exception {
    doTest("breakoff");
  }

  /**
   * Builds a spell checker from the {@code <name>.aff} and {@code <name>.dic} resources, then
   * requires every line of {@code <name>.good} to be accepted and every line of {@code
   * <name>.wrong} to be rejected. Either word list is skipped when its resource is absent.
   */
  protected void doTest(String name) throws Exception {
    SpellChecker speller = loadSpellChecker(name);
    verifyWords(speller, name + ".good", true);
    verifyWords(speller, name + ".wrong", false);
  }

  /** Opens both dictionary resources, builds the checker and closes the streams afterwards. */
  private SpellChecker loadSpellChecker(String name) throws Exception {
    InputStream aff = Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
    InputStream dic = Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name);
    try {
      return new SpellChecker(new Dictionary(new ByteBuffersDirectory(), "dictionary", aff, dic));
    } finally {
      IOUtils.closeWhileHandlingException(aff);
      IOUtils.closeWhileHandlingException(dic);
    }
  }

  /** Checks each line of the given resource, if it exists, against the expected verdict. */
  private void verifyWords(SpellChecker speller, String resourceName, boolean expectCorrect)
      throws Exception {
    URL resource = StemmerTestBase.class.getResource(resourceName);
    if (resource == null) {
      return;
    }
    for (String word : Files.readAllLines(Path.of(resource.toURI()))) {
      if (expectCorrect) {
        assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word));
      } else {
        assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
      }
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff
@ -0,0 +1,10 @@
 # word break points test, recursive break at dash and n-dash
 SET UTF-8
 BREAK 2
 BREAK -
 BREAK –
 WORDCHARS -–
 FORBIDDENWORD !
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic
@ -0,0 +1,7 @@
 6
 foo
 bar
 baz
 fox-bax
 foo-baz/!
 e-mail
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good
@ -0,0 +1,12 @@
 foo
 bar
 fox-bax
 foo-bar
 foo–bar
 foo-bar-foo-bar
 foo-bar–foo-bar
 bar-baz
 baz-foo
 foo-bar-foo-bar-foo-bar-foo-bar-foo-bar
 e-mail
 e-mail-foo
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong
@ -0,0 +1,13 @@
 fox
 bax
 -foo
 bar-
 fox-bar
 foo-bax
 foo–bax
 fox–bar
 foo-bar-fox-bar
 foo-bax-foo-bar
 foo-bar–fox-bar
 foo-bax–foo-bar
 foo-baz
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff
@ -0,0 +1,6 @@
 # default word break at hyphens and n-dashes
 SET UTF-8
 MAXNGRAMSUGS 0
 WORDCHARS -
 TRY ot
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic
@ -0,0 +1,6 @@
 3
 foo
 bar
 free
 scott
 scot-free
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good
@ -0,0 +1,7 @@
 foo
 bar
 foo-
 -foo
 scot-free
 foo-bar
 foo-bar-foo-bar
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong
@ -0,0 +1,6 @@
 scot
 sco-free
 fo-bar
 foo-fo-bar
 foo-foo-fo
 -
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff
@ -0,0 +1,7 @@
 # switch off default word break at hyphens and n-dashes by BREAK 0
 SET UTF-8
 MAXNGRAMSUGS 0
 WORDCHARS -
 TRY ot
 BREAK 0
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic
@ -0,0 +1,6 @@
 3
 foo
 bar
 free
 scott
 scot-free
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good
@ -0,0 +1,3 @@
 foo
 bar
 scot-free
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong
@ -0,0 +1,5 @@
 foo-
 -foo
 foo-bar
 foo-bar-foo-bar
 scot