LUCENE-9707: Hunspell: check Lucene's implementation against Hunspell's test data (#2267)

This commit is contained in:
Peter Gromov 2021-02-02 10:46:14 +01:00 committed by GitHub
parent 2da7a4a86d
commit b48d5beb34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 112 additions and 51 deletions

View File

@ -90,6 +90,9 @@ grant {
// allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission // allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
permission java.security.SecurityPermission "createAccessControlContext"; permission java.security.SecurityPermission "createAccessControlContext";
// Some Hunspell tests may read from external files specified in system properties
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
}; };
// Permissions to support ant build // Permissions to support ant build

View File

@ -16,35 +16,31 @@
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.text.ParseException;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.junit.Test;
public class SpellCheckerTest extends StemmerTestBase { public class SpellCheckerTest extends StemmerTestBase {
@Test
public void base() throws Exception { public void testBase() throws Exception {
doTest("base"); doTest("base");
} }
@Test public void testBaseUtf() throws Exception {
public void baseUtf() throws Exception {
doTest("base_utf"); doTest("base_utf");
} }
@Test public void testKeepcase() throws Exception {
public void keepcase() throws Exception {
doTest("keepcase"); doTest("keepcase");
} }
@Test public void testAllcaps() throws Exception {
public void allcaps() throws Exception {
doTest("allcaps"); doTest("allcaps");
} }
@ -52,63 +48,51 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("rep"); doTest("rep");
} }
@Test public void testForceUCase() throws Exception {
public void forceUCase() throws Exception {
doTest("forceucase"); doTest("forceucase");
} }
@Test public void testCheckSharpS() throws Exception {
public void checkSharpS() throws Exception {
doTest("checksharps"); doTest("checksharps");
} }
@Test public void testIJ() throws Exception {
public void IJ() throws Exception {
doTest("IJ"); doTest("IJ");
} }
@Test public void testI53643_numbersWithSeparators() throws Exception {
public void i53643_numbersWithSeparators() throws Exception {
doTest("i53643"); doTest("i53643");
} }
@Test public void testDotless_i() throws Exception {
public void dotless_i() throws Exception {
doTest("dotless_i"); doTest("dotless_i");
} }
@Test public void testNeedAffixOnAffixes() throws Exception {
public void needAffixOnAffixes() throws Exception {
doTest("needaffix5"); doTest("needaffix5");
} }
@Test public void testCompoundFlag() throws Exception {
public void compoundFlag() throws Exception {
doTest("compoundflag"); doTest("compoundflag");
} }
@Test public void testCheckCompoundCase() throws Exception {
public void checkCompoundCase() throws Exception {
doTest("checkcompoundcase"); doTest("checkcompoundcase");
} }
@Test public void testCheckCompoundDup() throws Exception {
public void checkCompoundDup() throws Exception {
doTest("checkcompounddup"); doTest("checkcompounddup");
} }
@Test public void testCheckCompoundTriple() throws Exception {
public void checkCompoundTriple() throws Exception {
doTest("checkcompoundtriple"); doTest("checkcompoundtriple");
} }
@Test public void testSimplifiedTriple() throws Exception {
public void simplifiedTriple() throws Exception {
doTest("simplifiedtriple"); doTest("simplifiedtriple");
} }
@Test public void testCompoundForbid() throws Exception {
public void compoundForbid() throws Exception {
doTest("compoundforbid"); doTest("compoundforbid");
} }
@ -161,10 +145,14 @@ public class SpellCheckerTest extends StemmerTestBase {
} }
protected void doTest(String name) throws Exception { protected void doTest(String name) throws Exception {
InputStream affixStream = checkSpellCheckerExpectations(
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
InputStream dictStream = }
Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name);
static void checkSpellCheckerExpectations(Path basePath, boolean checkSuggestions)
throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
SpellChecker speller; SpellChecker speller;
try { try {
@ -176,30 +164,30 @@ public class SpellCheckerTest extends StemmerTestBase {
IOUtils.closeWhileHandlingException(dictStream); IOUtils.closeWhileHandlingException(dictStream);
} }
URL good = StemmerTestBase.class.getResource(name + ".good"); Path good = Path.of(basePath + ".good");
if (good != null) { if (Files.exists(good)) {
for (String word : Files.readAllLines(Path.of(good.toURI()))) { for (String word : Files.readAllLines(good)) {
assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word)); assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word.trim()));
} }
} }
URL wrong = StemmerTestBase.class.getResource(name + ".wrong"); Path wrong = Path.of(basePath + ".wrong");
URL sug = StemmerTestBase.class.getResource(name + ".sug"); Path sug = Path.of(basePath + ".sug");
if (wrong != null) { if (Files.exists(wrong)) {
List<String> wrongWords = Files.readAllLines(Path.of(wrong.toURI())); List<String> wrongWords = Files.readAllLines(wrong);
for (String word : wrongWords) { for (String word : wrongWords) {
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word)); assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
} }
if (sug != null) { if (Files.exists(sug) && checkSuggestions) {
String suggestions = String suggestions =
wrongWords.stream() wrongWords.stream()
.map(s -> String.join(", ", speller.suggest(s))) .map(s -> String.join(", ", speller.suggest(s)))
.filter(s -> !s.isEmpty()) .filter(s -> !s.isEmpty())
.collect(Collectors.joining("\n")); .collect(Collectors.joining("\n"));
assertEquals(Files.readString(Path.of(sug.toURI())).trim(), suggestions); assertEquals(Files.readString(sug).trim(), suggestions);
} }
} else { } else {
assertNull(".sug file without .wrong file!", sug); assertFalse(".sug file without .wrong file!", Files.exists(sug));
} }
} }
} }

View File

@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.Collection;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.junit.AssumptionViolatedException;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
/**
 * Runs the same expectation checks as {@link SpellCheckerTest}, but over the test data shipped
 * with Hunspell itself. The location of the checked-out Hunspell repository is read from the
 * {@code hunspell.repo.path} system property (pass it as {@code -Dhunspell.repo.path=...}); one
 * parameterized test is produced for every {@code .aff} file found in its {@code tests} directory.
 */
@RunWith(Parameterized.class)
public class TestHunspellRepositoryTestCases {
  /** File extension that identifies a test case inside the {@code tests} directory. */
  private static final String AFFIX_SUFFIX = ".aff";

  /** Path of the test case files with the extension left off. */
  private final Path pathPrefix;

  /**
   * @param testName base name of the test case; it is not stored here, the {@code {0}} name
   *     pattern on {@link #data()} refers to it
   * @param pathPrefix path of the test case files without an extension
   */
  public TestHunspellRepositoryTestCases(String testName, Path pathPrefix) {
    this.pathPrefix = pathPrefix;
  }

  /**
   * Collects the test cases: every file in {@code <hunspell.repo.path>/tests} whose name ends with
   * {@code .aff} contributes one entry, ordered by base name.
   *
   * @return pairs of (base name, path prefix without extension)
   * @throws IOException if the {@code tests} directory cannot be listed
   * @throws AssumptionViolatedException if the {@code hunspell.repo.path} property is not set
   */
  @Parameterized.Parameters(name = "{0}")
  public static Collection<Object[]> data() throws IOException {
    String repoLocation = System.getProperty("hunspell.repo.path");
    if (repoLocation == null) {
      throw new AssumptionViolatedException("hunspell.repo.path property not specified.");
    }

    Path testDir = Path.of(repoLocation).resolve("tests");

    // A TreeSet keeps the base names sorted, so the parameter order does not depend on the
    // order in which the directory stream happens to return entries.
    Set<String> baseNames = new TreeSet<>();
    try (DirectoryStream<Path> entries = Files.newDirectoryStream(testDir)) {
      for (Path entry : entries) {
        String fileName = entry.getFileName().toString();
        if (!fileName.endsWith(AFFIX_SUFFIX)) {
          continue;
        }
        int stemLength = fileName.length() - AFFIX_SUFFIX.length();
        baseNames.add(fileName.substring(0, stemLength));
      }
    }

    return baseNames.stream()
        .map(baseName -> new Object[] {baseName, testDir.resolve(baseName)})
        .collect(Collectors.toList());
  }

  /**
   * Checks the spell checker against the expectation files of this test case. Suggestion checking
   * is switched off by passing {@code false}.
   */
  @Test
  public void test() throws IOException, ParseException {
    SpellCheckerTest.checkSpellCheckerExpectations(pathPrefix, false);
  }
}