LUCENE-9765: Hunspell: rename SpellChecker to Hunspell, fix test name, update javadoc and CHANGES.txt (#2354)

2021-02-12 15:44:36 +01:00 · 2021-02-12 15:44:36 +01:00 · 02ea7a1139
parent 9905c0cc2d
commit 02ea7a1139
8 changed files with 31 additions and 23 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -89,8 +89,8 @@ API Changes

 Improvements

-* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
-  BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
+* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
+  fix various behavior differences between Java and C++ implementations, improve performance (Peter Gromov, Dawid Weiss)

 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -43,9 +43,9 @@ class GeneratingSuggester {
  private static final int MAX_WORDS = 100;
  private static final int MAX_GUESSES = 200;
  private final Dictionary dictionary;
-  private final SpellChecker speller;
+  private final Hunspell speller;

-  GeneratingSuggester(SpellChecker speller) {
+  GeneratingSuggester(Hunspell speller) {
    this.dictionary = speller.dictionary;
    this.speller = speller;
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -34,15 +34,25 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;

 /**
- * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
- * (but a single underlying Dictionary can be shared by multiple spell-checkers in different
- * threads). Not all Hunspell features are supported yet.
+ * A spell checker based on Hunspell dictionaries. This class can be used in place of native
+ * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
+ * languages and features are supported yet. For example:
+ *
+ * <ul>
+ *   <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the
+ *       source code)
+ *   <li>Languages with Unicode characters outside of the Basic Multilingual Plane
+ *   <li>PHONE affix file option for suggestions
+ * </ul>
+ *
+ * <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be
+ * shared by multiple spell-checkers in different threads).
 */
-public class SpellChecker {
+public class Hunspell {
  final Dictionary dictionary;
  final Stemmer stemmer;

-  public SpellChecker(Dictionary dictionary) {
+  public Hunspell(Dictionary dictionary) {
    this.dictionary = dictionary;
    stemmer = new Stemmer(dictionary);
  }
@ -448,8 +458,8 @@ public class SpellChecker {
      }
    }

-    SpellChecker suggestionSpeller =
-        new SpellChecker(dictionary) {
+    Hunspell suggestionSpeller =
+        new Hunspell(dictionary) {
          @Override
          boolean acceptsStem(int formID) {
            return !dictionary.hasFlag(formID, dictionary.noSuggest)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -28,10 +28,10 @@ class ModifyingSuggester {
  private static final int MAX_CHAR_DISTANCE = 4;
  private final LinkedHashSet<String> result = new LinkedHashSet<>();
  private final char[] tryChars;
-  private final SpellChecker speller;
+  private final Hunspell speller;
  boolean hasGoodSuggestions;

-  ModifyingSuggester(SpellChecker speller) {
+  ModifyingSuggester(Hunspell speller) {
    this.speller = speller;
    tryChars = speller.dictionary.tryChars.toCharArray();
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package-info.java
@ -17,13 +17,11 @@

 /**
 * A Java implementation of <a href="http://hunspell.github.io/">Hunspell</a> stemming and
- * spell-checking algorithms, and a stemming TokenFilter based on it.
+ * spell-checking algorithms ({@link org.apache.lucene.analysis.hunspell.Hunspell}), and a stemming
+ * TokenFilter ({@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}) based on it.
 *
 * <p>For dictionaries, see e.g. <a href="https://github.com/LibreOffice/dictionaries">LibreOffice
 * repository</a> or <a href="https://github.com/wooorm/dictionaries">Titus Wormer's collection
 * (UTF)</a>
- *
- * @see org.apache.lucene.analysis.hunspell.HunspellStemFilter
- * @see org.apache.lucene.analysis.hunspell.SpellChecker
 */
 package org.apache.lucene.analysis.hunspell;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
@ -32,7 +32,7 @@ import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;

 /**
- * Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
+ * Same as {@link TestSpellChecking}, but checks all Hunspell's test data. The path to the checked
 * out Hunspell repository should be in {@code hunspell.repo.path} system property.
 */
@RunWith(Parameterized.class)
@ -78,7 +78,7 @@ public class TestHunspellRepositoryTestCases {

  @Test
  public void test() throws Throwable {
-    ThrowingRunnable test = () -> SpellCheckerTest.checkSpellCheckerExpectations(pathPrefix);
+    ThrowingRunnable test = () -> TestSpellChecking.checkSpellCheckerExpectations(pathPrefix);
    if (EXPECTED_FAILURES.contains(testName)) {
      Assert.assertThrows(Throwable.class, test);
    } else {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@ -76,7 +76,7 @@ public class TestPerformance extends LuceneTestCase {
    List<String> words = loadWords(code, wordCount, dictionary);

    Stemmer stemmer = new Stemmer(dictionary);
-    SpellChecker speller = new SpellChecker(dictionary);
+    Hunspell speller = new Hunspell(dictionary);
    measure(
        "Stemming " + code,
        blackHole -> {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@ -26,7 +26,7 @@ import java.util.stream.Collectors;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.util.IOUtils;

-public class SpellCheckerTest extends StemmerTestBase {
+public class TestSpellChecking extends StemmerTestBase {

  public void testBase() throws Exception {
    doTest("base");
@ -221,11 +221,11 @@ public class SpellCheckerTest extends StemmerTestBase {
    InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
    InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));

-    SpellChecker speller;
+    Hunspell speller;
    try {
      Dictionary dictionary =
          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
-      speller = new SpellChecker(dictionary);
+      speller = new Hunspell(dictionary);
    } finally {
      IOUtils.closeWhileHandlingException(affixStream);
      IOUtils.closeWhileHandlingException(dictStream);