LUCENE-9727: build side support for running Hunspell tests. (#2313)

Dawid Weiss 2021-02-08 10:50:25 +01:00 committed by GitHub
parent 1cc26b6bb4
commit 903782d756
7 changed files with 200 additions and 63 deletions

View File

@@ -91,10 +91,12 @@ grant {
// allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
permission java.security.SecurityPermission "createAccessControlContext";
// Hunspell regression and validation tests can read from external files
// specified in system properties.
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
permission java.io.FilePermission "${hunspell.dictionaries}", "read";
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
};
// Permissions to support ant build
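Both grants on ${hunspell.dictionaries} are needed because a FilePermission path ending in "/-" covers (recursively) everything below the directory but not the directory entry itself, which listing the directory checks. A minimal, self-contained sketch of that semantics (the path is illustrative):

import java.io.FilePermission;

public class FilePermissionDemo {
  public static void main(String[] args) {
    String dir = "/data/hunspell-dictionaries"; // stands in for ${hunspell.dictionaries}
    FilePermission recursive = new FilePermission(dir + "/-", "read");
    // "/-" implies every file and subdirectory below the directory...
    System.out.println(recursive.implies(new FilePermission(dir + "/en/en.aff", "read"))); // true
    // ...but not the directory itself, hence the additional plain grant above.
    System.out.println(recursive.implies(new FilePermission(dir, "read"))); // false
  }
}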

View File

@@ -23,3 +23,18 @@ dependencies {
api project(':lucene:core')
testImplementation project(':lucene:test-framework')
}
// Pass all hunspell-test-specific project properties to tests as system properties.
tasks.withType(Test) {
[
"hunspell.dictionaries",
"hunspell.corpora",
"hunspell.repo.path"
].each {
def val = propertyOrDefault(it, null)
if (val != null) {
logger.lifecycle("Passing property: ${it}=${val}")
systemProperty it, val
}
}
}
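Inside the forked test JVM these arrive as ordinary system properties, so a test can resolve them with the usual JUnit assumption guard and skip cleanly when a property was not passed. A minimal sketch of that pattern (class and method names are illustrative):

import java.nio.file.Path;
import java.nio.file.Paths;
import org.junit.Assume;

public class TestPropUtil {
  /** Returns the directory passed via the given property, or skips the calling test. */
  static Path requiredDir(String prop) {
    String value = System.getProperty(prop);
    Assume.assumeFalse("Requires -P" + prop + "=... on the gradle command line", value == null);
    return Paths.get(value);
  }
}

For example, running gradlew test -Phunspell.dictionaries=/path/to/dicts makes requiredDir("hunspell.dictionaries") resolve that path in every test task that ran the block above.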

View File

@@ -16,35 +16,52 @@
*/
package org.apache.lucene.analysis.hunspell;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.RamUsageTester;
import org.junit.Assume;
import org.junit.Ignore;
/**
* Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system
* property and prints their memory usage. All *.aff files are traversed recursively inside the
* given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
* directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
*/
@Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {
static Stream<Path> findAllAffixFiles() throws IOException {
String dicDir = System.getProperty("hunspell.dictionaries");
Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
Assume.assumeFalse(
"Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
Path dicPath = Paths.get(dicDir);
return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
}
static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
@@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase {
}
}
/** Hack bais to expose current position. */
private static class ExposePosition extends ByteArrayInputStream {
public ExposePosition(byte[] buf) {
super(buf);
}
public long position() {
return super.pos;
}
}
@Ignore
public void testMaxPrologueNeeded() throws Exception {
AtomicBoolean failTest = new AtomicBoolean();
Map<String, List<Long>> global = new LinkedHashMap<>();
for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
Map<String, List<Long>> local = new LinkedHashMap<>();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
int chr;
while ((chr = is.read()) >= 0) {
baos.write(chr);
if (chr == '\n') {
String line = baos.toString(StandardCharsets.ISO_8859_1);
if (!line.isBlank()) {
String firstWord = line.split("\\s")[0];
switch (firstWord) {
case "SET":
case "FLAG":
local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
break;
}
}
baos.reset();
}
}
}
local.forEach(
(flag, positions) -> {
if (positions.size() > 1) {
System.out.format(
Locale.ROOT,
"Flag %s at more than one position in %s: %s%n",
flag,
aff,
positions);
failTest.set(true);
}
});
}
global.forEach(
(flag, positions) -> {
long max = positions.stream().mapToLong(v -> v).max().orElse(0);
System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
});
if (failTest.get()) {
throw new AssertionError("Duplicate flags were present in at least one .aff file.");
}
}
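The maximum offsets printed here bound how far into an .aff file the SET and FLAG directives can occur, i.e. how much prologue a parser must be able to re-read once it discovers the declared encoding. A hypothetical sketch of using such a bound with mark/reset (not the actual Dictionary code; the constant is illustrative):

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

class PrologueSniff {
  // Illustrative bound; a real value would come from this test's output.
  static final int MAX_PROLOGUE = 64 * 1024;

  /** Peeks at the prologue for a SET directive, then rewinds the stream. */
  static Charset detectEncoding(BufferedInputStream in) throws IOException {
    in.mark(MAX_PROLOGUE);
    byte[] prologue = in.readNBytes(MAX_PROLOGUE);
    in.reset(); // the caller re-reads from the start using the detected charset
    for (String line : new String(prologue, StandardCharsets.ISO_8859_1).split("\n")) {
      if (line.startsWith("SET ")) {
        // Note: some Hunspell charset names (e.g. "microsoft-cp1251") need alias
        // mapping before Charset.forName accepts them.
        return Charset.forName(line.substring("SET ".length()).trim());
      }
    }
    return StandardCharsets.ISO_8859_1; // Hunspell's default encoding
  }
}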
public void testDictionariesLoadSuccessfully() throws Exception {
int threads = Runtime.getRuntime().availableProcessors();
ExecutorService executor =
Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
try {
List<Path> failures = Collections.synchronizedList(new ArrayList<>());
Function<Path, Void> process =
(Path aff) -> {
try {
System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
} catch (Throwable e) {
failures.add(aff);
System.err.println("While checking " + aff + ":");
e.printStackTrace();
}
return null;
};
for (Future<?> future :
executor.invokeAll(
findAllAffixFiles()
.map(aff -> (Callable<?>) () -> process.apply(aff))
.collect(Collectors.toList()))) {
future.get();
}
if (!failures.isEmpty()) {
throw new AssertionError(
"Certain dictionaries failed to parse:\n - "
+ failures.stream()
.map(path -> path.toAbsolutePath().toString())
.collect(Collectors.joining("\n - ")));
}
} finally {
executor.shutdown();
executor.awaitTermination(1, TimeUnit.MINUTES);
}
}
private static String memoryUsage(Dictionary dic) {
return RamUsageTester.humanSizeOf(dic)
+ "\t("
+ "words="
+ RamUsageTester.humanSizeOf(dic.words)
+ ", "
+ "flags="
+ RamUsageTester.humanSizeOf(dic.flagLookup)
+ ", "
+ "strips="
+ RamUsageTester.humanSizeOf(dic.stripData)
+ ", "
+ "conditions="
+ RamUsageTester.humanSizeOf(dic.patterns)
+ ", "
+ "affixData="
+ RamUsageTester.humanSizeOf(dic.affixData)
+ ", "
+ "prefixes="
+ RamUsageTester.humanSizeOf(dic.prefixes)
+ ", "
+ "suffixes="
+ RamUsageTester.humanSizeOf(dic.suffixes)
+ ")";
+ ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
+ ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
+ ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
+ ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
+ ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
+ ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
+ ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
}
}

View File

@@ -32,7 +32,7 @@ import org.junit.runners.Parameterized;
/**
* Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
* out Hunspell repository should be in {@code hunspell.repo.path} system property.
*/
@RunWith(Parameterized.class)
public class TestHunspellRepositoryTestCases {

View File

@@ -24,13 +24,15 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.AssumptionViolatedException;
import org.junit.BeforeClass;
import org.junit.Test;
/**
@@ -40,8 +42,15 @@ import org.junit.Test;
* en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
*/
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
@Ignore("enable manually")
public class TestPerformance extends LuceneTestCase {
private static Path corporaDir;
@BeforeClass
public static void resolveCorpora() {
String dir = System.getProperty("hunspell.corpora");
Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null);
corporaDir = Paths.get(dir);
}
@Test
public void en() throws Exception {
@@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase {
private void checkPerformance(String code, int wordCount) throws Exception {
Path aff = findAffFile(code);
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
System.out.println("Loaded " + aff);
@@ -92,15 +102,17 @@
return code.equals(Dictionary.extractLanguageCode(parentName));
})
.findFirst()
.orElseThrow(
() -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code));
}
private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
throws IOException {
Path dataPath = corporaDir.resolve(code + ".txt");
if (!Files.isReadable(dataPath)) {
throw new AssumptionViolatedException("Missing text corpora at: " + dataPath);
}
List<String> words = new ArrayList<>();
try (InputStream stream = Files.newInputStream(dataPath)) {
BufferedReader reader =

View File

@@ -20,6 +20,7 @@ import java.lang.reflect.Array;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.security.AccessControlException;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.text.DecimalFormat;
@@ -527,14 +528,14 @@ public final class RamUsageEstimator {
// Walk type hierarchy
for (; clazz != null; clazz = clazz.getSuperclass()) {
final Class<?> target = clazz;
final Field[] fields;
try {
fields =
AccessController.doPrivileged((PrivilegedAction<Field[]>) target::getDeclaredFields);
} catch (AccessControlException e) {
throw new RuntimeException("Can't access fields of class: " + target, e);
}
for (Field f : fields) {
if (!Modifier.isStatic(f.getModifiers())) {
size = adjustForField(size, f);

View File

@@ -23,11 +23,14 @@ import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Path;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -151,6 +154,14 @@ public final class RamUsageTester {
ArrayList<Object> stack,
Object ob,
Class<?> obClazz) {
// Ignore JDK objects we can't access or handle properly.
Predicate<Object> isIgnorable =
(clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder);
if (isIgnorable.test(ob)) {
return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack);
}
/*
* Consider an object. Push any references it has to the processing stack
* and accumulate this object's shallow size.
@@ -159,10 +170,7 @@ public final class RamUsageTester {
if (Constants.JRE_IS_MINIMUM_JAVA9) {
long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob);
Predicate<Class<?>> isJavaModule = (clazz) -> clazz.getName().startsWith("java.");
// Java 9: Best guess for some known types, as we cannot precisely look into runtime
// classes:
@@ -274,13 +282,17 @@ public final class RamUsageTester {
v.length())); // may not be correct with Java 9's compact strings!
a(StringBuilder.class, v -> charArraySize(v.capacity()));
a(StringBuffer.class, v -> charArraySize(v.capacity()));
// Approximate the underlying long[] buffer.
a(BitSet.class, v -> (v.size() / Byte.SIZE));
// Types with large buffers:
a(ByteArrayOutputStream.class, v -> byteArraySize(v.size()));
// For File and Path, we just take the length of String representation as
// approximation:
a(File.class, v -> charArraySize(v.toString().length()));
a(Path.class, v -> charArraySize(v.toString().length()));
// Ignorable JDK classes.
a(ByteOrder.class, v -> 0);
}
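The BitSet entry above works because a BitSet is backed by a long[] and size() reports the bits actually allocated in it (a multiple of 64), so size()/Byte.SIZE approximates the buffer in bytes, ignoring object headers. A quick check:

import java.util.BitSet;

public class BitSetApproxDemo {
  public static void main(String[] args) {
    BitSet bits = new BitSet(1 << 20); // ask for 1M bits
    // size() == 1048576 here, so the approximation yields 131072 bytes,
    // matching the 16384-element backing long[].
    System.out.println(bits.size() / Byte.SIZE);
  }
}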
@SuppressWarnings("unchecked")