mirror of https://github.com/apache/lucene.git
LUCENE-9728: Hunspell: add a performance test (#2296)
parent 650f16ad5d · commit 82f8d7ba1d
@@ -94,6 +94,7 @@ grant {
   // Some Hunspell tests may read from external files specified in system properties
   permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
   permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
+  permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
 };
 
 // Permissions to support ant build
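In the policy syntax above, ${hunspell.corpora} expands to the value of the matching -D system property, ${/} to the platform file separator, and the trailing "-" makes the grant recursive. With a hypothetical -Dhunspell.corpora=/data/corpora, the added entry effectively amounts to:

  permission java.io.FilePermission "/data/corpora/-", "read";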
@@ -109,6 +109,8 @@ public class Dictionary {
   char[] stripData;
   int[] stripOffsets;
 
+  String wordChars = "";
+
   // 4 chars per affix, each char representing an unsigned 2-byte integer
   char[] affixData = new char[32];
   private int currentAffix = 0;

@@ -384,11 +386,12 @@ public class Dictionary {
         fullStrip = true;
       } else if ("LANG".equals(firstWord)) {
         language = singleArgument(reader, line);
-        int underscore = language.indexOf("_");
-        String langCode = underscore < 0 ? language : language.substring(0, underscore);
+        String langCode = extractLanguageCode(language);
         alternateCasing = langCode.equals("tr") || langCode.equals("az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
+      } else if ("WORDCHARS".equals(firstWord)) {
+        wordChars = singleArgument(reader, line);
       } else if ("TRY".equals(firstWord)) {
         tryChars = singleArgument(reader, line);
       } else if ("REP".equals(firstWord)) {

@@ -460,6 +463,11 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  static String extractLanguageCode(String isoCode) {
+    int underscore = isoCode.indexOf("_");
+    return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
+  }
+
   private String singleArgument(LineNumberReader reader, String line) throws ParseException {
     return splitBySpace(reader, line, 2)[1];
   }
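The extracted extractLanguageCode helper is package-private, so the tests below can reuse it (TestPerformance matches dictionary directories by language code with it). A minimal usage sketch with illustrative inputs:

  // keeps only the part of an ISO-style code before the first underscore
  Dictionary.extractLanguageCode("de_DE"); // -> "de"
  Dictionary.extractLanguageCode("tr");    // -> "tr", which in turn switches on the Turkish/Azeri alternate casing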
@@ -21,8 +21,8 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.ParseException;
-import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;

@@ -41,15 +41,13 @@ import org.junit.Ignore;
 @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
 public class TestAllDictionaries extends LuceneTestCase {
 
-  private static List<Path> findAllAffixFiles() throws IOException {
+  static Stream<Path> findAllAffixFiles() throws IOException {
     String dicDir = System.getProperty("hunspell.dictionaries");
     Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
-    return Files.walk(Path.of(dicDir), 2)
-        .filter(f -> f.toString().endsWith(".aff"))
-        .collect(Collectors.toList());
+    return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
   }
 
-  private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
+  static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
     String affPath = aff.toString();
     Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
     assert Files.exists(dic) : dic;

@@ -62,7 +60,7 @@ public class TestAllDictionaries extends LuceneTestCase {
 
   public void testDictionariesLoadSuccessfully() throws Exception {
     int failures = 0;
-    for (Path aff : findAllAffixFiles()) {
+    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
       try {
         System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
       } catch (Throwable e) {
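Returning a Stream lets TestPerformance.findAffFile (below) stop at the first matching .aff file, while callers that want a plain loop collect the stream first, because a Stream is single-use and not Iterable. A sketch of both consumption styles, assuming the same package, the usual java.util/java.nio imports, and a surrounding method that declares IOException; the "en" prefix filter is illustrative only:

  // lazy: short-circuits at the first match, as TestPerformance.findAffFile does
  Optional<Path> anyEnglish =
      TestAllDictionaries.findAllAffixFiles()
          .filter(p -> p.getParent().getFileName().toString().startsWith("en"))
          .findFirst();

  // eager: materialize when a plain for-each over every affix file is needed
  for (Path aff : TestAllDictionaries.findAllAffixFiles().collect(Collectors.toList())) {
    System.out.println(aff);
  }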
@@ -0,0 +1,168 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import com.carrotsearch.randomizedtesting.annotations.TestCaseOrdering;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.Ignore;
import org.junit.Test;

/**
 * A test that runs various Hunspell APIs on real dictionaries and relatively large corpora for
 * specific languages and prints the execution times. The dictionaries should be set up as in
 * {@link TestAllDictionaries}, the corpora should be in files named {@code langCode.txt} (e.g.
 * {@code en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
 */
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
@Ignore("enable manually")
public class TestPerformance extends LuceneTestCase {

  @Test
  public void en() throws Exception {
    checkPerformance("en", 500_000);
  }

  @Test
  public void de() throws Exception {
    checkPerformance("de", 100_000);
  }

  @Test
  public void fr() throws Exception {
    checkPerformance("fr", 20_000);
  }

  private void checkPerformance(String code, int wordCount) throws Exception {
    Path aff = findAffFile(code);
    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
    System.out.println("Loaded " + aff);

    List<String> words = loadWords(code, wordCount, dictionary);

    Stemmer stemmer = new Stemmer(dictionary);
    SpellChecker speller = new SpellChecker(dictionary);
    measure(
        "Stemming " + code,
        blackHole -> {
          for (String word : words) {
            blackHole.accept(stemmer.stem(word));
          }
        });
    measure(
        "Spellchecking " + code,
        blackHole -> {
          for (String word : words) {
            blackHole.accept(speller.spell(word));
          }
        });
    System.out.println();
  }

  private Path findAffFile(String code) throws IOException {
    return TestAllDictionaries.findAllAffixFiles()
        .filter(
            path -> {
              String parentName = path.getParent().getFileName().toString();
              return code.equals(Dictionary.extractLanguageCode(parentName));
            })
        .findFirst()
        .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
  }

  private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
      throws IOException {
    String corpusDir = System.getProperty("hunspell.corpora");
    Assume.assumeFalse("", corpusDir == null);

    Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
    List<String> words = new ArrayList<>();
    try (InputStream stream = Files.newInputStream(dataPath)) {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
      while (true) {
        String line = reader.readLine();
        if (line == null) break;

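        // split the line on runs of characters outside [a-zA-Z] and the dictionary's WORDCHARS
        // (which typically include word-internal characters such as apostrophes)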
        for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
          String word = stripPunctuation(token);
          if (word != null) {
            words.add(word);
            if (words.size() == wordCount) {
              return words;
            }
          }
        }
      }
    }
    return words;
  }

  private void measure(String what, Iteration iteration) {
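    // feed every result through a consumer that rejects nulls; keeping results observable
    // this way also prevents the measured work from being optimized away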
    Consumer<Object> consumer =
        o -> {
          if (o == null) {
            throw new AssertionError();
          }
        };

    // warmup
    for (int i = 0; i < 2; i++) {
      iteration.run(consumer);
    }

    List<Long> times = new ArrayList<>();
    for (int i = 0; i < 7; i++) {
      long start = System.currentTimeMillis();
      iteration.run(consumer);
      times.add(System.currentTimeMillis() - start);
    }
    System.out.println(
        what
            + ": average "
            + times.stream().mapToLong(Long::longValue).average().orElseThrow()
            + ", all times = "
            + times);
  }

  private interface Iteration {
    void run(Consumer<Object> blackHole);
  }

  static String stripPunctuation(String token) {
    int start = 0;
    int end = token.length();
    while (start < end && isPunctuation(token.charAt(start))) start++;
    while (start < end - 1 && isPunctuation(token.charAt(end - 1))) end--;
    return start < end ? token.substring(start, end) : null;
  }

  private static boolean isPunctuation(char c) {
    return ".!?,\"'’‘".indexOf(c) >= 0;
  }
}
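A hedged extension sketch (the language code and word count are illustrative, not part of the commit): covering another language only needs a corpus file named after its code in the -Dhunspell.corpora directory plus one more test method, and @TestCaseOrdering keeps the per-language tests in a stable alphabetical order:

  // hypothetical addition to TestPerformance, assuming es.txt exists in the corpora directory
  @Test
  public void es() throws Exception {
    checkPerformance("es", 50_000);
  }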