From 903782d75656c63de018af6c102b14dd2cf2748d Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Mon, 8 Feb 2021 10:50:25 +0100 Subject: [PATCH] LUCENE-9727: build side support for running Hunspell tests. (#2313) --- .../randomization/policies/tests.policy | 6 +- lucene/analysis/common/build.gradle | 15 ++ .../hunspell/TestAllDictionaries.java | 177 ++++++++++++++---- .../TestHunspellRepositoryTestCases.java | 2 +- .../analysis/hunspell/TestPerformance.java | 24 ++- .../apache/lucene/util/RamUsageEstimator.java | 17 +- .../apache/lucene/util/RamUsageTester.java | 22 ++- 7 files changed, 200 insertions(+), 63 deletions(-) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index e17af8e78b6..469892c90cc 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -91,10 +91,12 @@ grant { // allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission permission java.security.SecurityPermission "createAccessControlContext"; - // Some Hunspell tests may read from external files specified in system properties + // Hunspell regression and validation tests can read from external files + // specified in system properties. permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read"; - permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; + permission java.io.FilePermission "${hunspell.dictionaries}", "read"; + permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; }; // Permissions to support ant build diff --git a/lucene/analysis/common/build.gradle b/lucene/analysis/common/build.gradle index a44152ccaf9..24c949f3f8d 100644 --- a/lucene/analysis/common/build.gradle +++ b/lucene/analysis/common/build.gradle @@ -23,3 +23,18 @@ dependencies { api project(':lucene:core') testImplementation project(':lucene:test-framework') } + +// Pass all hunspell-tests-specific project properties to tests as system properties. +tasks.withType(Test) { + [ + "hunspell.dictionaries", + "hunspell.corpora", + "hunspell.repo.path" + ].each { + def val = propertyOrDefault(it, null) + if (val != null) { + logger.lifecycle("Passing property: ${it}=${val}") + systemProperty it, val + } + } +} \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index 886272c396d..c2676631b25 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -16,35 +16,52 @@ */ package org.apache.lucene.analysis.hunspell; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.RamUsageTester; import org.junit.Assume; import org.junit.Ignore; /** - * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and - * prints their memory usage. All *.aff files are traversed directly inside the given directory or - * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For - * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package - * documentation} + * Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system + * property and prints their memory usage. All *.aff files are traversed recursively inside the + * given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such + * directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}. */ -@Ignore("enable manually") @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary") public class TestAllDictionaries extends LuceneTestCase { - static Stream findAllAffixFiles() throws IOException { String dicDir = System.getProperty("hunspell.dictionaries"); - Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null); - return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff")); + Assume.assumeFalse( + "Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null); + Path dicPath = Paths.get(dicDir); + return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted(); } static Dictionary loadDictionary(Path aff) throws IOException, ParseException { @@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase { } } - public void testDictionariesLoadSuccessfully() throws Exception { - int failures = 0; - for (Path aff : findAllAffixFiles().collect(Collectors.toList())) { - try { - System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff))); - } catch (Throwable e) { - failures++; - System.err.println("While checking " + aff + ":"); - e.printStackTrace(); - } + /** Hack bais to expose current position. */ + private static class ExposePosition extends ByteArrayInputStream { + public ExposePosition(byte[] buf) { + super(buf); + } + + public long position() { + return super.pos; + } + } + + @Ignore + public void testMaxPrologueNeeded() throws Exception { + AtomicBoolean failTest = new AtomicBoolean(); + + Map> global = new LinkedHashMap<>(); + for (Path aff : findAllAffixFiles().collect(Collectors.toList())) { + Map> local = new LinkedHashMap<>(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) { + int chr; + while ((chr = is.read()) >= 0) { + baos.write(chr); + + if (chr == '\n') { + String line = baos.toString(StandardCharsets.ISO_8859_1); + if (!line.isBlank()) { + String firstWord = line.split("\\s")[0]; + switch (firstWord) { + case "SET": + case "FLAG": + local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position()); + global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position()); + break; + } + } + + baos.reset(); + } + } + } + + local.forEach( + (flag, positions) -> { + if (positions.size() > 1) { + System.out.format( + Locale.ROOT, + "Flag %s at more than one position in %s: %s%n", + flag, + aff, + positions); + failTest.set(true); + } + }); + } + + global.forEach( + (flag, positions) -> { + long max = positions.stream().mapToLong(v -> v).max().orElse(0); + System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max); + }); + + if (failTest.get()) { + throw new AssertionError("Duplicate flags were present in at least one .aff file."); + } + } + + public void testDictionariesLoadSuccessfully() throws Exception { + int threads = Runtime.getRuntime().availableProcessors(); + ExecutorService executor = + Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-")); + try { + List failures = Collections.synchronizedList(new ArrayList<>()); + Function process = + (Path aff) -> { + try { + System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff))); + } catch (Throwable e) { + failures.add(aff); + System.err.println("While checking " + aff + ":"); + e.printStackTrace(); + } + return null; + }; + + for (Future future : + executor.invokeAll( + findAllAffixFiles() + .map(aff -> (Callable) () -> process.apply(aff)) + .collect(Collectors.toList()))) { + future.get(); + } + + if (!failures.isEmpty()) { + throw new AssertionError( + "Certain dictionaries failed to parse:\n - " + + failures.stream() + .map(path -> path.toAbsolutePath().toString()) + .collect(Collectors.joining("\n - "))); + } + } finally { + executor.shutdown(); + executor.awaitTermination(1, TimeUnit.MINUTES); } - assertEquals(failures + " failures!", 0, failures); } private static String memoryUsage(Dictionary dic) { return RamUsageTester.humanSizeOf(dic) + "\t(" - + "words=" - + RamUsageTester.humanSizeOf(dic.words) - + ", " - + "flags=" - + RamUsageTester.humanSizeOf(dic.flagLookup) - + ", " - + "strips=" - + RamUsageTester.humanSizeOf(dic.stripData) - + ", " - + "conditions=" - + RamUsageTester.humanSizeOf(dic.patterns) - + ", " - + "affixData=" - + RamUsageTester.humanSizeOf(dic.affixData) - + ", " - + "prefixes=" - + RamUsageTester.humanSizeOf(dic.prefixes) - + ", " - + "suffixes=" - + RamUsageTester.humanSizeOf(dic.suffixes) - + ")"; + + ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ") + + ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ") + + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ") + + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ") + + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ") + + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ") + + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")"); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java index 048dc044ba8..004ef735522 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java @@ -32,7 +32,7 @@ import org.junit.runners.Parameterized; /** * Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked - * out Hunspell repository should be in {@code -Dhunspell.repo.path=...} system property. + * out Hunspell repository should be in {@code hunspell.repo.path} system property. */ @RunWith(Parameterized.class) public class TestHunspellRepositoryTestCases { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index 33da1ca0299..e26cae75391 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -24,13 +24,15 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.function.Consumer; import java.util.regex.Pattern; import org.apache.lucene.util.LuceneTestCase; import org.junit.Assume; -import org.junit.Ignore; +import org.junit.AssumptionViolatedException; +import org.junit.BeforeClass; import org.junit.Test; /** @@ -40,8 +42,15 @@ import org.junit.Test; * en.txt}) in a directory specified in {@code -Dhunspell.corpora=...} */ @TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class) -@Ignore("enable manually") public class TestPerformance extends LuceneTestCase { + private static Path corporaDir; + + @BeforeClass + public static void resolveCorpora() { + String dir = System.getProperty("hunspell.corpora"); + Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null); + corporaDir = Paths.get(dir); + } @Test public void en() throws Exception { @@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase { private void checkPerformance(String code, int wordCount) throws Exception { Path aff = findAffFile(code); + Dictionary dictionary = TestAllDictionaries.loadDictionary(aff); System.out.println("Loaded " + aff); @@ -92,15 +102,17 @@ public class TestPerformance extends LuceneTestCase { return code.equals(Dictionary.extractLanguageCode(parentName)); }) .findFirst() - .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code)); + .orElseThrow( + () -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code)); } private List loadWords(String code, int wordCount, Dictionary dictionary) throws IOException { - String corpusDir = System.getProperty("hunspell.corpora"); - Assume.assumeFalse("", corpusDir == null); + Path dataPath = corporaDir.resolve(code + ".txt"); + if (!Files.isReadable(dataPath)) { + throw new AssumptionViolatedException("Missing text corpora at: " + dataPath); + } - Path dataPath = Path.of(corpusDir).resolve(code + ".txt"); List words = new ArrayList<>(); try (InputStream stream = Files.newInputStream(dataPath)) { BufferedReader reader = diff --git a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java index 714054618c5..6c9a75bc614 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java +++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java @@ -20,6 +20,7 @@ import java.lang.reflect.Array; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.lang.reflect.Modifier; +import java.security.AccessControlException; import java.security.AccessController; import java.security.PrivilegedAction; import java.text.DecimalFormat; @@ -527,14 +528,14 @@ public final class RamUsageEstimator { // Walk type hierarchy for (; clazz != null; clazz = clazz.getSuperclass()) { final Class target = clazz; - final Field[] fields = - AccessController.doPrivileged( - new PrivilegedAction() { - @Override - public Field[] run() { - return target.getDeclaredFields(); - } - }); + final Field[] fields; + try { + fields = + AccessController.doPrivileged((PrivilegedAction) target::getDeclaredFields); + } catch (AccessControlException e) { + throw new RuntimeException("Can't access fields of class: " + target, e); + } + for (Field f : fields) { if (!Modifier.isStatic(f.getModifiers())) { size = adjustForField(size, f); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java index 39d25563c88..2c5c6aff1ed 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java @@ -23,11 +23,14 @@ import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; import java.nio.file.Path; import java.security.AccessController; import java.security.PrivilegedAction; import java.util.AbstractList; import java.util.ArrayList; +import java.util.BitSet; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -151,6 +154,14 @@ public final class RamUsageTester { ArrayList stack, Object ob, Class obClazz) { + + // Ignore JDK objects we can't access or handle properly. + Predicate isIgnorable = + (clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder); + if (isIgnorable.test(ob)) { + return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack); + } + /* * Consider an object. Push any references it has to the processing stack * and accumulate this object's shallow size. @@ -159,10 +170,7 @@ public final class RamUsageTester { if (Constants.JRE_IS_MINIMUM_JAVA9) { long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob); - Predicate> isJavaModule = - (clazz) -> { - return clazz.getName().startsWith("java."); - }; + Predicate> isJavaModule = (clazz) -> clazz.getName().startsWith("java."); // Java 9: Best guess for some known types, as we cannot precisely look into runtime // classes: @@ -274,13 +282,17 @@ public final class RamUsageTester { v.length())); // may not be correct with Java 9's compact strings! a(StringBuilder.class, v -> charArraySize(v.capacity())); a(StringBuffer.class, v -> charArraySize(v.capacity())); + // Approximate the underlying long[] buffer. + a(BitSet.class, v -> (v.size() / Byte.SIZE)); // Types with large buffers: a(ByteArrayOutputStream.class, v -> byteArraySize(v.size())); // For File and Path, we just take the length of String representation as // approximation: a(File.class, v -> charArraySize(v.toString().length())); a(Path.class, v -> charArraySize(v.toString().length())); - a(ByteOrder.class, v -> 0); // Instances of ByteOrder are constants + + // Ignorable JDK classes. + a(ByteOrder.class, v -> 0); } @SuppressWarnings("unchecked")