Hunspell: parse WORDCHARS and add a manually enabled performance test.

Review notes (text before the first "diff --git" line is skipped by git apply):
  * This patch was re-wrapped from a copy whose line breaks were collapsed. All whitespace, and
    the generic type arguments (Stream<Path>, List<String>, Consumer<Object>, List<Long>) that
    appear to have been stripped from that copy, are reconstructions; run "git apply --check"
    against the target tree before relying on it.
  * TestPerformance.java was revised during review, so its content no longer matches the blob id
    on its "index" line. Changes relative to the reviewed copy:
      - findAffFile closes the stream obtained from findAllAffixFiles (it wraps Files.walk);
      - the hunspell.corpora assumption reports which system property is missing;
      - the token-separator regex is compiled once per corpus instead of once per line;
      - iterations are timed with System.nanoTime(), still reported in milliseconds;
      - Iteration is marked @FunctionalInterface; class Javadoc punctuation fixed.

diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
index 1103dd326c2..e17af8e78b6 100644
--- a/gradle/testing/randomization/policies/tests.policy
+++ b/gradle/testing/randomization/policies/tests.policy
@@ -94,6 +94,7 @@ grant {
   // Some Hunspell tests may read from external files specified in system properties
   permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
   permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
+  permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
 };
 
 // Permissions to support ant build
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index ebb76ee9eab..c4d902ef425 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -109,6 +109,8 @@ public class Dictionary {
   char[] stripData;
   int[] stripOffsets;
 
+  String wordChars = "";
+
   // 4 chars per affix, each char representing an unsigned 2-byte integer
   char[] affixData = new char[32];
   private int currentAffix = 0;
@@ -384,11 +386,12 @@ public class Dictionary {
         fullStrip = true;
       } else if ("LANG".equals(firstWord)) {
         language = singleArgument(reader, line);
-        int underscore = language.indexOf("_");
-        String langCode = underscore < 0 ? language : language.substring(0, underscore);
+        String langCode = extractLanguageCode(language);
         alternateCasing = langCode.equals("tr") || langCode.equals("az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
+      } else if ("WORDCHARS".equals(firstWord)) {
+        wordChars = singleArgument(reader, line);
       } else if ("TRY".equals(firstWord)) {
         tryChars = singleArgument(reader, line);
       } else if ("REP".equals(firstWord)) {
@@ -460,6 +463,11 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  static String extractLanguageCode(String isoCode) {
+    int underscore = isoCode.indexOf("_");
+    return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
+  }
+
   private String singleArgument(LineNumberReader reader, String line) throws ParseException {
     return splitBySpace(reader, line, 2)[1];
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index ad621af8961..886272c396d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -21,8 +21,8 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.ParseException;
-import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
@@ -41,15 +41,13 @@ import org.junit.Ignore;
 @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
 public class TestAllDictionaries extends LuceneTestCase {
 
-  private static List<Path> findAllAffixFiles() throws IOException {
+  static Stream<Path> findAllAffixFiles() throws IOException {
     String dicDir = System.getProperty("hunspell.dictionaries");
     Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
-    return Files.walk(Path.of(dicDir), 2)
-        .filter(f -> f.toString().endsWith(".aff"))
-        .collect(Collectors.toList());
+    return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
   }
 
-  private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
+  static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
     String affPath = aff.toString();
     Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
     assert Files.exists(dic) : dic;
@@ -62,7 +60,7 @@ public class TestAllDictionaries extends LuceneTestCase {
 
   public void testDictionariesLoadSuccessfully() throws Exception {
     int failures = 0;
-    for (Path aff : findAllAffixFiles()) {
+    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
       try {
         System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
       } catch (Throwable e) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
new file mode 100644
index 00000000000..33da1ca0299
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import com.carrotsearch.randomizedtesting.annotations.TestCaseOrdering;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Assume;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * A test that runs various Hunspell APIs on real dictionaries and relatively large corpora for
+ * specific languages and prints the execution times. The dictionaries should be set up as in {@link
+ * TestAllDictionaries}; the corpora should be in files named {@code langCode.txt} (e.g. {@code
+ * en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}.
+ */
+@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
+@Ignore("enable manually")
+public class TestPerformance extends LuceneTestCase {
+
+  @Test
+  public void en() throws Exception {
+    checkPerformance("en", 500_000);
+  }
+
+  @Test
+  public void de() throws Exception {
+    checkPerformance("de", 100_000);
+  }
+
+  @Test
+  public void fr() throws Exception {
+    checkPerformance("fr", 20_000);
+  }
+
+  private void checkPerformance(String code, int wordCount) throws Exception {
+    Path aff = findAffFile(code);
+    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
+    System.out.println("Loaded " + aff);
+
+    List<String> words = loadWords(code, wordCount, dictionary);
+
+    Stemmer stemmer = new Stemmer(dictionary);
+    SpellChecker speller = new SpellChecker(dictionary);
+    measure(
+        "Stemming " + code,
+        blackHole -> {
+          for (String word : words) {
+            blackHole.accept(stemmer.stem(word));
+          }
+        });
+    measure(
+        "Spellchecking " + code,
+        blackHole -> {
+          for (String word : words) {
+            blackHole.accept(speller.spell(word));
+          }
+        });
+    System.out.println();
+  }
+
+  private Path findAffFile(String code) throws IOException {
+    // the stream wraps Files.walk, which keeps directories open until the stream is closed
+    try (Stream<Path> affFiles = TestAllDictionaries.findAllAffixFiles()) {
+      return affFiles
+          .filter(
+              path -> {
+                String parentName = path.getParent().getFileName().toString();
+                return code.equals(Dictionary.extractLanguageCode(parentName));
+              })
+          .findFirst()
+          .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
+    }
+  }
+
+  private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
+      throws IOException {
+    String corpusDir = System.getProperty("hunspell.corpora");
+    Assume.assumeFalse("Missing -Dhunspell.corpora=...", corpusDir == null);
+
+    Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
+    // compiled once here: String.split(regex) would compile it again for every corpus line
+    Pattern separator =
+        Pattern.compile("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+");
+    List<String> words = new ArrayList<>();
+    try (InputStream stream = Files.newInputStream(dataPath)) {
+      BufferedReader reader =
+          new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
+      while (true) {
+        String line = reader.readLine();
+        if (line == null) break;
+
+        for (String token : separator.split(line)) {
+          String word = stripPunctuation(token);
+          if (word != null) {
+            words.add(word);
+            if (words.size() == wordCount) {
+              return words;
+            }
+          }
+        }
+      }
+    }
+    return words;
+  }
+
+  private void measure(String what, Iteration iteration) {
+    Consumer<Object> consumer =
+        o -> {
+          if (o == null) {
+            throw new AssertionError();
+          }
+        };
+
+    // warmup
+    for (int i = 0; i < 2; i++) {
+      iteration.run(consumer);
+    }
+
+    List<Long> times = new ArrayList<>();
+    for (int i = 0; i < 7; i++) {
+      // nanoTime is the elapsed-time clock; results are still reported in whole milliseconds
+      long start = System.nanoTime();
+      iteration.run(consumer);
+      times.add((System.nanoTime() - start) / 1_000_000);
+    }
+    System.out.println(
+        what
+            + ": average "
+            + times.stream().mapToLong(Long::longValue).average().orElseThrow()
+            + ", all times = "
+            + times);
+  }
+
+  @FunctionalInterface
+  private interface Iteration {
+    void run(Consumer<Object> blackHole);
+  }
+
+  static String stripPunctuation(String token) {
+    int start = 0;
+    int end = token.length();
+    while (start < end && isPunctuation(token.charAt(start))) start++;
+    while (start < end - 1 && isPunctuation(token.charAt(end - 1))) end--;
+    return start < end ? token.substring(start, end) : null;
+  }
+
+  private static boolean isPunctuation(char c) {
+    return ".!?,\"'’‘".indexOf(c) >= 0;
+  }
+}