From 903782d75656c63de018af6c102b14dd2cf2748d Mon Sep 17 00:00:00 2001
From: Dawid Weiss <dawid.weiss@carrotsearch.com>
Date: Mon, 8 Feb 2021 10:50:25 +0100
Subject: [PATCH] LUCENE-9727: build side support for running Hunspell tests.
 (#2313)

---
 .../randomization/policies/tests.policy       |   6 +-
 lucene/analysis/common/build.gradle           |  15 ++
 .../hunspell/TestAllDictionaries.java         | 177 ++++++++++++++----
 .../TestHunspellRepositoryTestCases.java      |   2 +-
 .../analysis/hunspell/TestPerformance.java    |  24 ++-
 .../apache/lucene/util/RamUsageEstimator.java |  17 +-
 .../apache/lucene/util/RamUsageTester.java    |  22 ++-
 7 files changed, 200 insertions(+), 63 deletions(-)

diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
index e17af8e78b6..469892c90cc 100644
--- a/gradle/testing/randomization/policies/tests.policy
+++ b/gradle/testing/randomization/policies/tests.policy
@@ -91,10 +91,12 @@ grant {
   // allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
   permission java.security.SecurityPermission "createAccessControlContext";
 
-  // Some Hunspell tests may read from external files specified in system properties
+  // Hunspell regression and validation tests can read from external files
+  // specified in system properties.
   permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
-  permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
   permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
+  permission java.io.FilePermission "${hunspell.dictionaries}", "read";
+  permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
 };
 
 // Permissions to support ant build
diff --git a/lucene/analysis/common/build.gradle b/lucene/analysis/common/build.gradle
index a44152ccaf9..24c949f3f8d 100644
--- a/lucene/analysis/common/build.gradle
+++ b/lucene/analysis/common/build.gradle
@@ -23,3 +23,18 @@ dependencies {
   api project(':lucene:core')
   testImplementation project(':lucene:test-framework')
 }
+
+// Pass all hunspell-tests-specific project properties to tests as system properties.
+tasks.withType(Test) {
+  [
+      "hunspell.dictionaries",
+      "hunspell.corpora",
+      "hunspell.repo.path"
+  ].each {
+    def val = propertyOrDefault(it, null)
+    if (val != null) {
+      logger.lifecycle("Passing property: ${it}=${val}")
+      systemProperty it, val
+    }
+  }
+}
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index 886272c396d..c2676631b25 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -16,35 +16,52 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.NamedThreadFactory;
 import org.apache.lucene.util.RamUsageTester;
 import org.junit.Assume;
 import org.junit.Ignore;
 
 /**
- * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
- * prints their memory usage. All *.aff files are traversed directly inside the given directory or
- * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
- * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
- * documentation}
+ * Loads all dictionaries from the directory specified in {@code hunspell.dictionaries} system
+ * property and prints their memory usage. All *.aff files are traversed recursively inside the
+ * given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
+ * directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
  */
-@Ignore("enable manually")
 @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
 public class TestAllDictionaries extends LuceneTestCase {
-
   static Stream<Path> findAllAffixFiles() throws IOException {
     String dicDir = System.getProperty("hunspell.dictionaries");
-    Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
-    return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
+    Assume.assumeFalse(
+        "Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
+    Path dicPath = Paths.get(dicDir);
+    return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
   }
 
   static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
@@ -58,43 +75,121 @@ public class TestAllDictionaries extends LuceneTestCase {
     }
   }
 
-  public void testDictionariesLoadSuccessfully() throws Exception {
-    int failures = 0;
-    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
-      try {
-        System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
-      } catch (Throwable e) {
-        failures++;
-        System.err.println("While checking " + aff + ":");
-        e.printStackTrace();
-      }
+  /** Hack bais to expose current position. */
+  private static class ExposePosition extends ByteArrayInputStream {
+    public ExposePosition(byte[] buf) {
+      super(buf);
+    }
+
+    public long position() {
+      return super.pos;
+    }
+  }
+
+  @Ignore
+  public void testMaxPrologueNeeded() throws Exception {
+    AtomicBoolean failTest = new AtomicBoolean();
+
+    Map<String, List<Long>> global = new LinkedHashMap<>();
+    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
+      Map<String, List<Long>> local = new LinkedHashMap<>();
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
+        int chr;
+        while ((chr = is.read()) >= 0) {
+          baos.write(chr);
+
+          if (chr == '\n') {
+            String line = baos.toString(StandardCharsets.ISO_8859_1);
+            if (!line.isBlank()) {
+              String firstWord = line.split("\\s")[0];
+              switch (firstWord) {
+                case "SET":
+                case "FLAG":
+                  local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+                  global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
+                  break;
+              }
+            }
+
+            baos.reset();
+          }
+        }
+      }
+
+      local.forEach(
+          (flag, positions) -> {
+            if (positions.size() > 1) {
+              System.out.format(
+                  Locale.ROOT,
+                  "Flag %s at more than one position in %s: %s%n",
+                  flag,
+                  aff,
+                  positions);
+              failTest.set(true);
+            }
+          });
+    }
+
+    global.forEach(
+        (flag, positions) -> {
+          long max = positions.stream().mapToLong(v -> v).max().orElse(0);
+          System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
+        });
+
+    if (failTest.get()) {
+      throw new AssertionError("Duplicate flags were present in at least one .aff file.");
+    }
+  }
+
+  public void testDictionariesLoadSuccessfully() throws Exception {
+    int threads = Runtime.getRuntime().availableProcessors();
+    ExecutorService executor =
+        Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
+    try {
+      List<Path> failures = Collections.synchronizedList(new ArrayList<>());
+      Function<Path, Void> process =
+          (Path aff) -> {
+            try {
+              System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
+            } catch (Throwable e) {
+              failures.add(aff);
+              System.err.println("While checking " + aff + ":");
+              e.printStackTrace();
+            }
+            return null;
+          };
+
+      for (Future<?> future :
+          executor.invokeAll(
+              findAllAffixFiles()
+                  .map(aff -> (Callable<?>) () -> process.apply(aff))
+                  .collect(Collectors.toList()))) {
+        future.get();
+      }
+
+      if (!failures.isEmpty()) {
+        throw new AssertionError(
+            "Certain dictionaries failed to parse:\n  - "
+                + failures.stream()
+                    .map(path -> path.toAbsolutePath().toString())
+                    .collect(Collectors.joining("\n  - ")));
+      }
+    } finally {
+      executor.shutdown();
+      executor.awaitTermination(1, TimeUnit.MINUTES);
     }
-    assertEquals(failures + " failures!", 0, failures);
   }
 
   private static String memoryUsage(Dictionary dic) {
     return RamUsageTester.humanSizeOf(dic)
         + "\t("
-        + "words="
-        + RamUsageTester.humanSizeOf(dic.words)
-        + ", "
-        + "flags="
-        + RamUsageTester.humanSizeOf(dic.flagLookup)
-        + ", "
-        + "strips="
-        + RamUsageTester.humanSizeOf(dic.stripData)
-        + ", "
-        + "conditions="
-        + RamUsageTester.humanSizeOf(dic.patterns)
-        + ", "
-        + "affixData="
-        + RamUsageTester.humanSizeOf(dic.affixData)
-        + ", "
-        + "prefixes="
-        + RamUsageTester.humanSizeOf(dic.prefixes)
-        + ", "
-        + "suffixes="
-        + RamUsageTester.humanSizeOf(dic.suffixes)
-        + ")";
+        + ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
+        + ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
+        + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
+        + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
+        + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
+        + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
+        + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
index 048dc044ba8..004ef735522 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
@@ -32,7 +32,7 @@ import org.junit.runners.Parameterized;
 
 /**
  * Same as {@link SpellCheckerTest}, but checks all Hunspell's test data. The path to the checked
- * out Hunspell repository should be in {@code -Dhunspell.repo.path=...} system property.
+ * out Hunspell repository should be in {@code hunspell.repo.path} system property.
  */
 @RunWith(Parameterized.class)
 public class TestHunspellRepositoryTestCases {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 33da1ca0299..e26cae75391 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -24,13 +24,15 @@ import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Assume;
-import org.junit.Ignore;
+import org.junit.AssumptionViolatedException;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 /**
@@ -40,8 +42,15 @@ import org.junit.Test;
  * en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
  */
 @TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
-@Ignore("enable manually")
 public class TestPerformance extends LuceneTestCase {
+  private static Path corporaDir;
+
+  @BeforeClass
+  public static void resolveCorpora() {
+    String dir = System.getProperty("hunspell.corpora");
+    Assume.assumeFalse("Requires test word corpora at -Dhunspell.corpora=...", dir == null);
+    corporaDir = Paths.get(dir);
+  }
 
   @Test
   public void en() throws Exception {
@@ -60,6 +69,7 @@ public class TestPerformance extends LuceneTestCase {
 
   private void checkPerformance(String code, int wordCount) throws Exception {
     Path aff = findAffFile(code);
+
     Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
     System.out.println("Loaded " + aff);
 
@@ -92,15 +102,17 @@ public class TestPerformance extends LuceneTestCase {
               return code.equals(Dictionary.extractLanguageCode(parentName));
             })
         .findFirst()
-        .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
+        .orElseThrow(
+            () -> new AssumptionViolatedException("Ignored, cannot find aff/dic for: " + code));
   }
 
   private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
       throws IOException {
-    String corpusDir = System.getProperty("hunspell.corpora");
-    Assume.assumeFalse("", corpusDir == null);
+    Path dataPath = corporaDir.resolve(code + ".txt");
+    if (!Files.isReadable(dataPath)) {
+      throw new AssumptionViolatedException("Missing text corpora at: " + dataPath);
+    }
 
-    Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
     List<String> words = new ArrayList<>();
     try (InputStream stream = Files.newInputStream(dataPath)) {
       BufferedReader reader =
diff --git a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
index 714054618c5..6c9a75bc614 100644
--- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
@@ -20,6 +20,7 @@ import java.lang.reflect.Array;
 import java.lang.reflect.Field;
 import java.lang.reflect.Method;
 import java.lang.reflect.Modifier;
+import java.security.AccessControlException;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.text.DecimalFormat;
@@ -527,14 +528,14 @@ public final class RamUsageEstimator {
     // Walk type hierarchy
     for (; clazz != null; clazz = clazz.getSuperclass()) {
       final Class<?> target = clazz;
-      final Field[] fields =
-          AccessController.doPrivileged(
-              new PrivilegedAction<Field[]>() {
-                @Override
-                public Field[] run() {
-                  return target.getDeclaredFields();
-                }
-              });
+      final Field[] fields;
+      try {
+        fields =
+            AccessController.doPrivileged((PrivilegedAction<Field[]>) target::getDeclaredFields);
+      } catch (AccessControlException e) {
+        throw new RuntimeException("Can't access fields of class: " + target, e);
+      }
+
       for (Field f : fields) {
         if (!Modifier.isStatic(f.getModifiers())) {
           size = adjustForField(size, f);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
index 39d25563c88..2c5c6aff1ed 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/RamUsageTester.java
@@ -23,11 +23,14 @@ import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
 import java.nio.file.Path;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.util.AbstractList;
 import java.util.ArrayList;
+import java.util.BitSet;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@@ -151,6 +154,14 @@ public final class RamUsageTester {
       ArrayList<Object> stack,
       Object ob,
       Class<?> obClazz) {
+
+    // Ignore JDK objects we can't access or handle properly.
+    Predicate<Object> isIgnorable =
+        (clazz) -> (clazz instanceof CharsetEncoder) || (clazz instanceof CharsetDecoder);
+    if (isIgnorable.test(ob)) {
+      return accumulator.accumulateObject(ob, 0, Collections.emptyMap(), stack);
+    }
+
     /*
      * Consider an object. Push any references it has to the processing stack
      * and accumulate this object's shallow size.
@@ -159,10 +170,7 @@ public final class RamUsageTester {
       if (Constants.JRE_IS_MINIMUM_JAVA9) {
         long alignedShallowInstanceSize = RamUsageEstimator.shallowSizeOf(ob);
 
-        Predicate<Class<?>> isJavaModule =
-            (clazz) -> {
-              return clazz.getName().startsWith("java.");
-            };
+        Predicate<Class<?>> isJavaModule = (clazz) -> clazz.getName().startsWith("java.");
 
         // Java 9: Best guess for some known types, as we cannot precisely look into runtime
         // classes:
@@ -274,13 +282,17 @@ public final class RamUsageTester {
                           v.length())); // may not be correct with Java 9's compact strings!
               a(StringBuilder.class, v -> charArraySize(v.capacity()));
               a(StringBuffer.class, v -> charArraySize(v.capacity()));
+              // Approximate the underlying long[] buffer.
+              a(BitSet.class, v -> (v.size() / Byte.SIZE));
               // Types with large buffers:
               a(ByteArrayOutputStream.class, v -> byteArraySize(v.size()));
               // For File and Path, we just take the length of String representation as
               // approximation:
               a(File.class, v -> charArraySize(v.toString().length()));
               a(Path.class, v -> charArraySize(v.toString().length()));
-              a(ByteOrder.class, v -> 0); // Instances of ByteOrder are constants
+
+              // Ignorable JDK classes.
+              a(ByteOrder.class, v -> 0);
             }
 
             @SuppressWarnings("unchecked")