LUCENE-9727: build side support for running Hunspell tests. (#2313)

2021-02-08 10:50:25 +01:00 · 2021-02-08 10:50:25 +01:00 · 903782d756
parent 1cc26b6bb4
commit 903782d756
7 changed files with 200 additions and 63 deletions
--- a/gradle/testing/randomization/policies/tests.policy
+++ b/gradle/testing/randomization/policies/tests.policy
@ -91,10 +91,12 @@ grant {
  // allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
  permission java.security.SecurityPermission "createAccessControlContext";

-  // Some Hunspell tests may read from external files specified in system properties
+  // Hunspell regression and validation tests can read from external files
+  // specified in system properties.
  permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
-  permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
  permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
+  permission java.io.FilePermission "${hunspell.dictionaries}", "read";
+  permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
 };

 // Permissions to support ant build
--- a/lucene/analysis/common/build.gradle
+++ b/lucene/analysis/common/build.gradle
@ -23,3 +23,18 @@ dependencies {
  api project(':lucene:core')
  testImplementation project(':lucene:test-framework')
 }
+
+// Pass all hunspell-tests-specific project properties to tests as system properties.
+tasks.withType(Test) {
+  [
+      "hunspell.dictionaries",
+      "hunspell.corpora",
+      "hunspell.repo.path"
+  ].each {
+    def val = propertyOrDefault(it, null)
+    if (val != null) {
+      logger.lifecycle("Passing property: ${it}=${val}")
+      systemProperty it, val
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@ -16,35 +16,52 @@
 */
 package org.apache.lucene.analysis.hunspell;

+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.NamedThreadFactory;
 import org.apache.lucene.util.RamUsageTester;
 import org.junit.Assume;
 import org.junit.Ignore;

 /**
- * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
- * prints their memory usage. All *.aff files are traversed directly inside the given directory or
- * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
- * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
- * documentation}
+ * Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system
+ * property and prints their memory usage. All *.aff files are traversed recursively inside the
+ * given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
+ * directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
 */
-@Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
 public class TestAllDictionaries extends LuceneTestCase {
-
  static Stream<Path> findAllAffixFiles() throws IOException {
    String dicDir = System.getProperty("hunspell.dictionaries");
-    Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
-    return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
+    Assume.assumeFalse(
+        "Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
+    Path dicPath = Paths.get(dicDir);
+    return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
  }

  static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase {
    }
  }

-  public void testDictionariesLoadSuccessfully() throws Exception {
-    int failures = 0;
-    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
-      try {
-        System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
-      } catch (Throwable e) {
-        failures++;
-        System.err.println("While checking " + aff + ":");
-        e.printStackTrace();
-      }
+  /** Hack bais to expose current position. */
+  private static class ExposePosition extends ByteArrayInputStream {
+    public ExposePosition(byte[] buf) {
+      super(buf);
+    }
+
+    public long position() {
+      return super.pos;
+    }
+  }
+
+  @Ignore
+  public void testMaxPrologueNeeded() throws Exception {
+    AtomicBoolean failTest = new AtomicBoolean();
+
+    Map<String, List<Long>> global = new LinkedHashMap<>();
+    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
+      Map<String, List<Long>> local = new LinkedHashMap<>();
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
+        int chr;
+        while ((chr = is.read()) >= 0) {
+          baos.write(chr);
+
+          if (chr == '\n') {
+            String line = baos.toString(StandardCharsets.ISO_8859_1);
+            if (!line.isBlank()) {
+              String firstWord = line.split("\\s")[0];
+              switch (firstWord) {
+                case "SET":
+                case "FLAG":
+                  local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+                  global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+                  break;
+              }
+            }
+
+            baos.reset();
+          }
+        }
+      }
+
+      local.forEach(
+          (flag, positions) -> {
+            if (positions.size() > 1) {
+              System.out.format(
+                  Locale.ROOT,
+                  "Flag %s at more than one position in %s: %s%n",
+                  flag,
+                  aff,
+                  positions);
+              failTest.set(true);
+            }
+          });
+    }
+
+    global.forEach(
+        (flag, positions) -> {
+          long max = positions.stream().mapToLong(v -> v).max().orElse(0);
+          System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
+        });
+
+    if (failTest.get()) {
+      throw new AssertionError("Duplicate flags were present in at least one .aff file.");
+    }
+  }
+
+  public void testDictionariesLoadSuccessfully() throws Exception {
+    int threads = Runtime.getRuntime().availableProcessors();
+    ExecutorService executor =
+        Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
+    try {
+      List<Path> failures = Collections.synchronizedList(new ArrayList<>());
+      Function<Path, Void> process =
+          (Path aff) -> {
+            try {
+              System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
+            } catch (Throwable e) {
+              failures.add(aff);
+              System.err.println("While checking " + aff + ":");
+              e.printStackTrace();
+            }
+            return null;
+          };
+
+      for (Future<?> future :
+          executor.invokeAll(
+              findAllAffixFiles()
+                  .map(aff -> (Callable<?>) () -> process.apply(aff))
+                  .collect(Collectors.toList()))) {
+        future.get();
+      }
+
+      if (!failures.isEmpty()) {
+        throw new AssertionError(
+            "Certain dictionaries failed to parse:\n  - "
+                + failures.stream()
+                    .map(path -> path.toAbsolutePath().toString())
+                    .collect(Collectors.joining("\n  - ")));
+      }
+    } finally {
+      executor.shutdown();
+      executor.awaitTermination(1, TimeUnit.MINUTES);
    }
-    assertEquals(failures + " failures!", 0, failures);
  }

  private static String memoryUsage(Dictionary dic) {
    return RamUsageTester.humanSizeOf(dic)
        + "\t("
-        + "words="
-        + RamUsageTester.humanSizeOf(dic.words)
-        + ", "
-        + "flags="
-        + RamUsageTester.humanSizeOf(dic.flagLookup)
-        + ", "
-        + "strips="
-        + RamUsageTester.humanSizeOf(dic.stripData)
-        + ", "
-        + "conditions="
-        + RamUsageTester.humanSizeOf(dic.patterns)
-        + ", "
-        + "affixData="
-        + RamUsageTester.humanSizeOf(dic.affixData)
-        + ", "
-        + "prefixes="
-        + RamUsageTester.humanSizeOf(dic.prefixes)
-        + ", "
-        + "suffixes="
-        + RamUsageTester.humanSizeOf(dic.suffixes)
-        + ")";
+        + ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
+        + ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
+        + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
+        + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
+        + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
+        + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
+        + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
@ -32,7 +32,7 @@ import org.junit.runners.Parameterized;

 /**
 * Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
- * out Hunspell repository should be in {@code -Dhunspell.repo.path=...} system property.
+ * out Hunspell repository should be in {@code hunspell.repo.path} system property.
 */
@RunWith(Parameterized.class)
 public class TestHunspellRepositoryTestCases {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@ -24,13 +24,15 @@ import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Assume;
-import org.junit.Ignore;
+import org.junit.AssumptionViolatedException;
+import org.junit.BeforeClass;
 import org.junit.Test;

 /**
@ -40,8 +42,15 @@ import org.junit.Test;
 * en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
 */
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
-@Ignore("enable manually")
 public class TestPerformance extends LuceneTestCase {
+  private static Path corporaDir;
+
+  @BeforeClass
+  public static void resolveCorpora() {
+    String dir = System.getProperty("hunspell.corpora");
+    Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null);
+    corporaDir = Paths.get(dir);
+  }

  @Test
  public void en() throws Exception {
@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase {

  private void checkPerformance(String code, int wordCount) throws Exception {
    Path aff = findAffFile(code);
+
    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
    System.out.println("Loaded " + aff);

@ -92,15 +102,17 @@ public class TestPerformance extends LuceneTestCase {
              return code.equals(Dictionary.extractLanguageCode(parentName));
            })
        .findFirst()
-        .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
+        .orElseThrow(
+            () -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code));
  }

  private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
      throws IOException {
-    String corpusDir = System.getProperty("hunspell.corpora");
-    Assume.assumeFalse("", corpusDir == null);
+    Path dataPath = corporaDir.resolve(code + ".txt");
+    if (!Files.isReadable(dataPath)) {
+      throw new AssumptionViolatedException("Missing text corpora at: " + dataPath);
+    }

-    Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
    List<String> words = new ArrayList<>();
    try (InputStream stream = Files.newInputStream(dataPath)) {
      BufferedReader reader =
--- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
@ -20,6 +20,7 @@ import java.lang.reflect.Array;
 import java.lang.reflect.Field;
 import java.lang.reflect.Method;
 import java.lang.reflect.Modifier;
+import java.security.AccessControlException;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.text.DecimalFormat;
@ -527,14 +528,14 @@ public final class RamUsageEstimator {
    // Walk type hierarchy
    for (; clazz != null; clazz = clazz.getSuperclass()) {
      final Class<?> target = clazz;
-      final Field[] fields =
-          AccessController.doPrivileged(
-              new PrivilegedAction<Field[]>() {
-                @Override
-                public Field[] run() {
-                  return target.getDeclaredFields();
-                }
-              });
+      final Field[] fields;
+      try {
+        fields =
+            AccessController.doPrivileged((PrivilegedAction<Field[]>) target::getDeclaredFields);
+      } catch (AccessControlException e) {
+        throw new RuntimeException("Can't access fields of class: " + target, e);
+      }
+
      for (Field f : fields) {
        if (!Modifier.isStatic(f.getModifiers())) {
          size = adjustForField(size, f);
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
@ -23,11 +23,14 @@ import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
 import java.nio.file.Path;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.util.AbstractList;
 import java.util.ArrayList;
+import java.util.BitSet;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@ -151,6 +154,14 @@ public final class RamUsageTester {
      ArrayList<Object> stack,
      Object ob,
      Class<?> obClazz) {
+
+    // Ignore JDK objects we can't access or handle properly.
+    Predicate<Object> isIgnorable =
+        (clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder);
+    if (isIgnorable.test(ob)) {
+      return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack);
+    }
+
    /*
     * Consider an object. Push any references it has to the processing stack
     * and accumulate this object's shallow size.
@ -159,10 +170,7 @@ public final class RamUsageTester {
      if (Constants.JRE_IS_MINIMUM_JAVA9) {
        long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob);

-        Predicate<Class<?>> isJavaModule =
-            (clazz) -> {
-              return clazz.getName().startsWith("java.");
-            };
+        Predicate<Class<?>> isJavaModule = (clazz) -> clazz.getName().startsWith("java.");

        // Java 9: Best guess for some known types, as we cannot precisely look into runtime
        // classes:
@ -274,13 +282,17 @@ public final class RamUsageTester {
                          v.length())); // may not be correct with Java 9's compact strings!
              a(StringBuilder.class, v -> charArraySize(v.capacity()));
              a(StringBuffer.class, v -> charArraySize(v.capacity()));
+              // Approximate the underlying long[] buffer.
+              a(BitSet.class, v -> (v.size() / Byte.SIZE));
              // Types with large buffers:
              a(ByteArrayOutputStream.class, v -> byteArraySize(v.size()));
              // For File and Path, we just take the length of String representation as
              // approximation:
              a(File.class, v -> charArraySize(v.toString().length()));
              a(Path.class, v -> charArraySize(v.toString().length()));
-              a(ByteOrder.class, v -> 0); // Instances of ByteOrder are constants
+
+              // Ignorable JDK classes.
+              a(ByteOrder.class, v -> 0);
            }

            @SuppressWarnings("unchecked")