LUCENE-9765: Hunspell: rename SpellChecker to Hunspell, fix test name, update javadoc and CHANGES.txt (#2354)

This commit is contained in:
Peter Gromov 2021-02-12 15:44:36 +01:00 committed by GitHub
parent 9905c0cc2d
commit 02ea7a1139
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 31 additions and 23 deletions

View File

@ -89,8 +89,8 @@ API Changes
Improvements
* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
fix various behavior differences between Java and C++ implementations, improve performance (Peter Gromov, Dawid Weiss)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)

View File

@ -43,9 +43,9 @@ class GeneratingSuggester {
private static final int MAX_WORDS = 100;
private static final int MAX_GUESSES = 200;
private final Dictionary dictionary;
private final SpellChecker speller;
private final Hunspell speller;
GeneratingSuggester(SpellChecker speller) {
GeneratingSuggester(Hunspell speller) {
this.dictionary = speller.dictionary;
this.speller = speller;
}

View File

@ -34,15 +34,25 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
/**
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
* (but a single underlying Dictionary can be shared by multiple spell-checkers in different
* threads). Not all Hunspell features are supported yet.
* A spell checker based on Hunspell dictionaries. This class can be used in place of native
* Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
* languages and features are supported yet. For example:
*
* <ul>
* <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the
* source code)
* <li>Languages with Unicode characters outside of the Basic Multilingual Plane
* <li>PHONE affix file option for suggestions
* </ul>
*
* <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be
* shared by multiple spell-checkers in different threads).
*/
public class SpellChecker {
public class Hunspell {
final Dictionary dictionary;
final Stemmer stemmer;
public SpellChecker(Dictionary dictionary) {
public Hunspell(Dictionary dictionary) {
this.dictionary = dictionary;
stemmer = new Stemmer(dictionary);
}
@ -448,8 +458,8 @@ public class SpellChecker {
}
}
SpellChecker suggestionSpeller =
new SpellChecker(dictionary) {
Hunspell suggestionSpeller =
new Hunspell(dictionary) {
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)

View File

@ -28,10 +28,10 @@ class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4;
private final LinkedHashSet<String> result = new LinkedHashSet<>();
private final char[] tryChars;
private final SpellChecker speller;
private final Hunspell speller;
boolean hasGoodSuggestions;
ModifyingSuggester(SpellChecker speller) {
ModifyingSuggester(Hunspell speller) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
}

View File

@ -17,13 +17,11 @@
/**
* A Java implementation of <a href="http://hunspell.github.io/">Hunspell</a> stemming and
* spell-checking algorithms, and a stemming TokenFilter based on it.
* spell-checking algorithms ({@link org.apache.lucene.analysis.hunspell.Hunspell}), and a stemming
* TokenFilter ({@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}) based on it.
*
* <p>For dictionaries, see e.g. <a href="https://github.com/LibreOffice/dictionaries">LibreOffice
* repository</a> or <a href="https://github.com/wooorm/dictionaries">Titus Wormer's collection
* (UTF)</a>.
*
* @see org.apache.lucene.analysis.hunspell.HunspellStemFilter
* @see org.apache.lucene.analysis.hunspell.SpellChecker
*/
package org.apache.lucene.analysis.hunspell;

View File

@ -32,7 +32,7 @@ import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
/**
* Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
* Same as {@link TestSpellChecking}, but checks all Hunspell's test data. The path to the checked
* out Hunspell repository should be in {@code hunspell.repo.path} system property.
*/
@RunWith(Parameterized.class)
@ -78,7 +78,7 @@ public class TestHunspellRepositoryTestCases {
@Test
public void test() throws Throwable {
ThrowingRunnable test = () -> SpellCheckerTest.checkSpellCheckerExpectations(pathPrefix);
ThrowingRunnable test = () -> TestSpellChecking.checkSpellCheckerExpectations(pathPrefix);
if (EXPECTED_FAILURES.contains(testName)) {
Assert.assertThrows(Throwable.class, test);
} else {

View File

@ -76,7 +76,7 @@ public class TestPerformance extends LuceneTestCase {
List<String> words = loadWords(code, wordCount, dictionary);
Stemmer stemmer = new Stemmer(dictionary);
SpellChecker speller = new SpellChecker(dictionary);
Hunspell speller = new Hunspell(dictionary);
measure(
"Stemming " + code,
blackHole -> {

View File

@ -26,7 +26,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;
public class SpellCheckerTest extends StemmerTestBase {
public class TestSpellChecking extends StemmerTestBase {
public void testBase() throws Exception {
doTest("base");
@ -221,11 +221,11 @@ public class SpellCheckerTest extends StemmerTestBase {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
SpellChecker speller;
Hunspell speller;
try {
Dictionary dictionary =
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
speller = new SpellChecker(dictionary);
speller = new Hunspell(dictionary);
} finally {
IOUtils.closeWhileHandlingException(affixStream);
IOUtils.closeWhileHandlingException(dictStream);