mirror of https://github.com/apache/lucene.git
LUCENE-9727: build side support for running Hunspell tests. (#2313)
This commit is contained in:
parent
1cc26b6bb4
commit
903782d756
|
@ -91,10 +91,12 @@ grant {
|
|||
// allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
|
||||
permission java.security.SecurityPermission "createAccessControlContext";
|
||||
|
||||
// Some Hunspell tests may read from external files specified in system properties
|
||||
// Hunspell regression and validation tests can read from external files
|
||||
// specified in system properties.
|
||||
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
|
||||
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
|
||||
permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
|
||||
permission java.io.FilePermission "${hunspell.dictionaries}", "read";
|
||||
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
|
||||
};
|
||||
|
||||
// Permissions to support ant build
|
||||
|
|
|
@ -23,3 +23,18 @@ dependencies {
|
|||
api project(':lucene:core')
|
||||
testImplementation project(':lucene:test-framework')
|
||||
}
|
||||
|
||||
// Pass all hunspell-tests-specific project properties to tests as system properties.
|
||||
tasks.withType(Test) {
|
||||
[
|
||||
"hunspell.dictionaries",
|
||||
"hunspell.corpora",
|
||||
"hunspell.repo.path"
|
||||
].each {
|
||||
def val = propertyOrDefault(it, null)
|
||||
if (val != null) {
|
||||
logger.lifecycle("Passing property: ${it}=${val}")
|
||||
systemProperty it, val
|
||||
}
|
||||
}
|
||||
}
|
|
@ -16,35 +16,52 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.ParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
|
||||
import org.apache.lucene.util.NamedThreadFactory;
|
||||
import org.apache.lucene.util.RamUsageTester;
|
||||
import org.junit.Assume;
|
||||
import org.junit.Ignore;
|
||||
|
||||
/**
|
||||
* Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
|
||||
* prints their memory usage. All *.aff files are traversed directly inside the given directory or
|
||||
* in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
|
||||
* examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
|
||||
* documentation}
|
||||
* Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system
|
||||
* property and prints their memory usage. All *.aff files are traversed recursively inside the
|
||||
* given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
|
||||
* directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
|
||||
*/
|
||||
@Ignore("enable manually")
|
||||
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
|
||||
public class TestAllDictionaries extends LuceneTestCase {
|
||||
|
||||
static Stream<Path> findAllAffixFiles() throws IOException {
|
||||
String dicDir = System.getProperty("hunspell.dictionaries");
|
||||
Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
|
||||
return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
|
||||
Assume.assumeFalse(
|
||||
"Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
|
||||
Path dicPath = Paths.get(dicDir);
|
||||
return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
|
||||
}
|
||||
|
||||
static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
|
||||
|
@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testDictionariesLoadSuccessfully() throws Exception {
|
||||
int failures = 0;
|
||||
for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
|
||||
try {
|
||||
System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
|
||||
} catch (Throwable e) {
|
||||
failures++;
|
||||
System.err.println("While checking " + aff + ":");
|
||||
e.printStackTrace();
|
||||
}
|
||||
/** Hack: a ByteArrayInputStream subclass that exposes the current read position. */
|
||||
private static class ExposePosition extends ByteArrayInputStream {
|
||||
public ExposePosition(byte[] buf) {
|
||||
super(buf);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return super.pos;
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore
|
||||
public void testMaxPrologueNeeded() throws Exception {
|
||||
AtomicBoolean failTest = new AtomicBoolean();
|
||||
|
||||
Map<String, List<Long>> global = new LinkedHashMap<>();
|
||||
for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
|
||||
Map<String, List<Long>> local = new LinkedHashMap<>();
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
|
||||
int chr;
|
||||
while ((chr = is.read()) >= 0) {
|
||||
baos.write(chr);
|
||||
|
||||
if (chr == '\n') {
|
||||
String line = baos.toString(StandardCharsets.ISO_8859_1);
|
||||
if (!line.isBlank()) {
|
||||
String firstWord = line.split("\\s")[0];
|
||||
switch (firstWord) {
|
||||
case "SET":
|
||||
case "FLAG":
|
||||
local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
|
||||
global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
baos.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
local.forEach(
|
||||
(flag, positions) -> {
|
||||
if (positions.size() > 1) {
|
||||
System.out.format(
|
||||
Locale.ROOT,
|
||||
"Flag %s at more than one position in %s: %s%n",
|
||||
flag,
|
||||
aff,
|
||||
positions);
|
||||
failTest.set(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
global.forEach(
|
||||
(flag, positions) -> {
|
||||
long max = positions.stream().mapToLong(v -> v).max().orElse(0);
|
||||
System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
|
||||
});
|
||||
|
||||
if (failTest.get()) {
|
||||
throw new AssertionError("Duplicate flags were present in at least one .aff file.");
|
||||
}
|
||||
}
|
||||
|
||||
public void testDictionariesLoadSuccessfully() throws Exception {
|
||||
int threads = Runtime.getRuntime().availableProcessors();
|
||||
ExecutorService executor =
|
||||
Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
|
||||
try {
|
||||
List<Path> failures = Collections.synchronizedList(new ArrayList<>());
|
||||
Function<Path, Void> process =
|
||||
(Path aff) -> {
|
||||
try {
|
||||
System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
|
||||
} catch (Throwable e) {
|
||||
failures.add(aff);
|
||||
System.err.println("While checking " + aff + ":");
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
for (Future<?> future :
|
||||
executor.invokeAll(
|
||||
findAllAffixFiles()
|
||||
.map(aff -> (Callable<?>) () -> process.apply(aff))
|
||||
.collect(Collectors.toList()))) {
|
||||
future.get();
|
||||
}
|
||||
|
||||
if (!failures.isEmpty()) {
|
||||
throw new AssertionError(
|
||||
"Certain dictionaries failed to parse:\n - "
|
||||
+ failures.stream()
|
||||
.map(path -> path.toAbsolutePath().toString())
|
||||
.collect(Collectors.joining("\n - ")));
|
||||
}
|
||||
} finally {
|
||||
executor.shutdown();
|
||||
executor.awaitTermination(1, TimeUnit.MINUTES);
|
||||
}
|
||||
assertEquals(failures + " failures!", 0, failures);
|
||||
}
|
||||
|
||||
private static String memoryUsage(Dictionary dic) {
|
||||
return RamUsageTester.humanSizeOf(dic)
|
||||
+ "\t("
|
||||
+ "words="
|
||||
+ RamUsageTester.humanSizeOf(dic.words)
|
||||
+ ", "
|
||||
+ "flags="
|
||||
+ RamUsageTester.humanSizeOf(dic.flagLookup)
|
||||
+ ", "
|
||||
+ "strips="
|
||||
+ RamUsageTester.humanSizeOf(dic.stripData)
|
||||
+ ", "
|
||||
+ "conditions="
|
||||
+ RamUsageTester.humanSizeOf(dic.patterns)
|
||||
+ ", "
|
||||
+ "affixData="
|
||||
+ RamUsageTester.humanSizeOf(dic.affixData)
|
||||
+ ", "
|
||||
+ "prefixes="
|
||||
+ RamUsageTester.humanSizeOf(dic.prefixes)
|
||||
+ ", "
|
||||
+ "suffixes="
|
||||
+ RamUsageTester.humanSizeOf(dic.suffixes)
|
||||
+ ")";
|
||||
+ ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
|
||||
+ ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
|
||||
+ ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
|
||||
+ ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
|
||||
+ ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
|
||||
+ ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
|
||||
+ ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@ import org.junit.runners.Parameterized;
|
|||
|
||||
/**
|
||||
* Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
|
||||
* out Hunspell repository should be in {@code -Dhunspell.repo.path=...} system property.
|
||||
* out Hunspell repository should be in {@code hunspell.repo.path} system property.
|
||||
*/
|
||||
@RunWith(Parameterized.class)
|
||||
public class TestHunspellRepositoryTestCases {
|
||||
|
|
|
@ -24,13 +24,15 @@ import java.io.InputStreamReader;
|
|||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.regex.Pattern;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Assume;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.AssumptionViolatedException;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -40,8 +42,15 @@ import org.junit.Test;
|
|||
* en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
|
||||
*/
|
||||
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
|
||||
@Ignore("enable manually")
|
||||
public class TestPerformance extends LuceneTestCase {
|
||||
private static Path corporaDir;
|
||||
|
||||
@BeforeClass
|
||||
public static void resolveCorpora() {
|
||||
String dir = System.getProperty("hunspell.corpora");
|
||||
Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null);
|
||||
corporaDir = Paths.get(dir);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void en() throws Exception {
|
||||
|
@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase {
|
|||
|
||||
private void checkPerformance(String code, int wordCount) throws Exception {
|
||||
Path aff = findAffFile(code);
|
||||
|
||||
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
|
||||
System.out.println("Loaded " + aff);
|
||||
|
||||
|
@ -92,15 +102,17 @@ public class TestPerformance extends LuceneTestCase {
|
|||
return code.equals(Dictionary.extractLanguageCode(parentName));
|
||||
})
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
|
||||
.orElseThrow(
|
||||
() -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code));
|
||||
}
|
||||
|
||||
private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
|
||||
throws IOException {
|
||||
String corpusDir = System.getProperty("hunspell.corpora");
|
||||
Assume.assumeFalse("", corpusDir == null);
|
||||
Path dataPath = corporaDir.resolve(code + ".txt");
|
||||
if (!Files.isReadable(dataPath)) {
|
||||
throw new AssumptionViolatedException("Missing text corpora at: " + dataPath);
|
||||
}
|
||||
|
||||
Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
|
||||
List<String> words = new ArrayList<>();
|
||||
try (InputStream stream = Files.newInputStream(dataPath)) {
|
||||
BufferedReader reader =
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.lang.reflect.Array;
|
|||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.security.AccessControlException;
|
||||
import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
import java.text.DecimalFormat;
|
||||
|
@ -527,14 +528,14 @@ public final class RamUsageEstimator {
|
|||
// Walk type hierarchy
|
||||
for (; clazz != null; clazz = clazz.getSuperclass()) {
|
||||
final Class<?> target = clazz;
|
||||
final Field[] fields =
|
||||
AccessController.doPrivileged(
|
||||
new PrivilegedAction<Field[]>() {
|
||||
@Override
|
||||
public Field[] run() {
|
||||
return target.getDeclaredFields();
|
||||
}
|
||||
});
|
||||
final Field[] fields;
|
||||
try {
|
||||
fields =
|
||||
AccessController.doPrivileged((PrivilegedAction<Field[]>) target::getDeclaredFields);
|
||||
} catch (AccessControlException e) {
|
||||
throw new RuntimeException("Can't access fields of class: " + target, e);
|
||||
}
|
||||
|
||||
for (Field f : fields) {
|
||||
if (!Modifier.isStatic(f.getModifiers())) {
|
||||
size = adjustForField(size, f);
|
||||
|
|
|
@ -23,11 +23,14 @@ import java.lang.reflect.Field;
|
|||
import java.lang.reflect.Modifier;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.file.Path;
|
||||
import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
import java.util.AbstractList;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
@ -151,6 +154,14 @@ public final class RamUsageTester {
|
|||
ArrayList<Object> stack,
|
||||
Object ob,
|
||||
Class<?> obClazz) {
|
||||
|
||||
// Ignore JDK objects we can't access or handle properly.
|
||||
Predicate<Object> isIgnorable =
|
||||
(clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder);
|
||||
if (isIgnorable.test(ob)) {
|
||||
return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack);
|
||||
}
|
||||
|
||||
/*
|
||||
* Consider an object. Push any references it has to the processing stack
|
||||
* and accumulate this object's shallow size.
|
||||
|
@ -159,10 +170,7 @@ public final class RamUsageTester {
|
|||
if (Constants.JRE_IS_MINIMUM_JAVA9) {
|
||||
long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob);
|
||||
|
||||
Predicate<Class<?>> isJavaModule =
|
||||
(clazz) -> {
|
||||
return clazz.getName().startsWith("java.");
|
||||
};
|
||||
Predicate<Class<?>> isJavaModule = (clazz) -> clazz.getName().startsWith("java.");
|
||||
|
||||
// Java 9: Best guess for some known types, as we cannot precisely look into runtime
|
||||
// classes:
|
||||
|
@ -274,13 +282,17 @@ public final class RamUsageTester {
|
|||
v.length())); // may not be correct with Java 9's compact strings!
|
||||
a(StringBuilder.class, v -> charArraySize(v.capacity()));
|
||||
a(StringBuffer.class, v -> charArraySize(v.capacity()));
|
||||
// Approximate the underlying long[] buffer.
|
||||
a(BitSet.class, v -> (v.size() / Byte.SIZE));
|
||||
// Types with large buffers:
|
||||
a(ByteArrayOutputStream.class, v -> byteArraySize(v.size()));
|
||||
// For File and Path, we just take the length of the String representation as an
|
||||
// approximation:
|
||||
a(File.class, v -> charArraySize(v.toString().length()));
|
||||
a(Path.class, v -> charArraySize(v.toString().length()));
|
||||
a(ByteOrder.class, v -> 0); // Instances of ByteOrder are constants
|
||||
|
||||
// Ignorable JDK classes.
|
||||
a(ByteOrder.class, v -> 0);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
|
|
Loading…
Reference in New Issue