LUCENE-9728: Hunspell: add a performance test (#2296)

Peter Gromov 2021-02-05 09:47:02 +01:00 committed by GitHub
parent 650f16ad5d
commit 82f8d7ba1d
4 changed files with 184 additions and 9 deletions

View File

@@ -94,6 +94,7 @@ grant {
// Some Hunspell tests may read from external files specified in system properties
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
};
// Permissions to support ant build
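The new grant follows the same pattern as the existing hunspell.repo.path and hunspell.dictionaries permissions: a test resolves a directory from a system property and reads files beneath it. A minimal sketch of the access pattern this grant has to allow under the test security manager (the property name comes from this patch; the class name and the en.txt file name are only illustrative):

import java.nio.file.Files;
import java.nio.file.Path;

// Illustrative sketch, not part of the patch.
class CorpusReadSketch {
  public static void main(String[] args) throws Exception {
    // Same system property that the policy entry above references.
    String corpusDir = System.getProperty("hunspell.corpora");
    if (corpusDir == null) {
      System.out.println("Pass -Dhunspell.corpora=/path/to/corpora");
      return;
    }
    // "en.txt" follows the langCode.txt naming used by TestPerformance below.
    Path corpus = Path.of(corpusDir).resolve("en.txt");
    // Allowed by the "${hunspell.corpora}${/}-" FilePermission added above.
    System.out.println(Files.readAllLines(corpus).size() + " lines in " + corpus);
  }
}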

View File

@@ -109,6 +109,8 @@ public class Dictionary {
char[] stripData;
int[] stripOffsets;
String wordChars = "";
// 4 chars per affix, each char representing an unsigned 2-byte integer
char[] affixData = new char[32];
private int currentAffix = 0;
@@ -384,11 +386,12 @@ public class Dictionary {
fullStrip = true;
} else if ("LANG".equals(firstWord)) {
language = singleArgument(reader, line);
int underscore = language.indexOf("_");
String langCode = underscore < 0 ? language : language.substring(0, underscore);
String langCode = extractLanguageCode(language);
alternateCasing = langCode.equals("tr") || langCode.equals("az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("WORDCHARS".equals(firstWord)) {
wordChars = singleArgument(reader, line);
} else if ("TRY".equals(firstWord)) {
tryChars = singleArgument(reader, line);
} else if ("REP".equals(firstWord)) {
@@ -460,6 +463,11 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
static String extractLanguageCode(String isoCode) {
int underscore = isoCode.indexOf("_");
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
}
private String singleArgument(LineNumberReader reader, String line) throws ParseException {
return splitBySpace(reader, line, 2)[1];
}
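The language-code logic that used to be inlined in the LANG branch is now a package-private helper, which the performance test below reuses to match dictionary directory names against a bare language code. A small behavior sketch, assuming only what the method body above shows (the sample inputs are illustrative, and package membership is required because the helper is package-private):

package org.apache.lucene.analysis.hunspell; // extractLanguageCode is package-private

// Illustrative sketch, not part of the patch.
class ExtractLanguageCodeSketch {
  public static void main(String[] args) {
    // Everything before the first underscore is kept; without an underscore the input is returned unchanged.
    System.out.println(Dictionary.extractLanguageCode("de_DE")); // de
    System.out.println(Dictionary.extractLanguageCode("en"));    // en
    // The LANG branch above uses the result to turn on Turkish/Azeri alternate casing.
    String langCode = Dictionary.extractLanguageCode("tr_TR");   // tr
    System.out.println(langCode.equals("tr") || langCode.equals("az")); // true
  }
}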

View File

@@ -21,8 +21,8 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
@@ -41,15 +41,13 @@ import org.junit.Ignore;
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {
private static List<Path> findAllAffixFiles() throws IOException {
static Stream<Path> findAllAffixFiles() throws IOException {
String dicDir = System.getProperty("hunspell.dictionaries");
Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
return Files.walk(Path.of(dicDir), 2)
.filter(f -> f.toString().endsWith(".aff"))
.collect(Collectors.toList());
return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
}
private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
String affPath = aff.toString();
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
assert Files.exists(dic) : dic;
@@ -62,7 +60,7 @@ public class TestAllDictionaries extends LuceneTestCase {
public void testDictionariesLoadSuccessfully() throws Exception {
int failures = 0;
for (Path aff : findAllAffixFiles()) {
for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
try {
System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
} catch (Throwable e) {

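Two things change here: findAllAffixFiles() now returns the Files.walk stream directly instead of a collected List, and both helpers drop private so that TestPerformance (in the same package, below) can reuse them. Since Files.walk streams hold open directory resources, a reuse sketch that closes the stream with try-with-resources might look like this (illustrative, not part of the patch):

package org.apache.lucene.analysis.hunspell;

// Illustrative sketch, not part of the patch; assumes -Dhunspell.dictionaries is set,
// as the Assume in findAllAffixFiles requires.
class AffixFileListingSketch {
  static void printSomeAffixFiles() throws java.io.IOException {
    // Files.walk streams hold open directory handles, so close them promptly.
    try (java.util.stream.Stream<java.nio.file.Path> affixes = TestAllDictionaries.findAllAffixFiles()) {
      // Streams are single-use: consume them directly (as findAffFile in TestPerformance does)
      // or collect to a List (as testDictionariesLoadSuccessfully does above).
      affixes.limit(5).forEach(aff -> System.out.println(aff));
    }
  }
}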
View File

@@ -0,0 +1,168 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import com.carrotsearch.randomizedtesting.annotations.TestCaseOrdering;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.Ignore;
import org.junit.Test;
/**
* A test that runs various Hunspell APIs on real dictionaries and relatively large corpora for
* specific languages and prints the execution times. The dictionaries should be set up as in {@link
* TestAllDictionaries}; the corpora should be in files named {@code langCode.txt} (e.g. {@code
* en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}.
*/
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
@Ignore("enable manually")
public class TestPerformance extends LuceneTestCase {
@Test
public void en() throws Exception {
checkPerformance("en", 500_000);
}
@Test
public void de() throws Exception {
checkPerformance("de", 100_000);
}
@Test
public void fr() throws Exception {
checkPerformance("fr", 20_000);
}
private void checkPerformance(String code, int wordCount) throws Exception {
Path aff = findAffFile(code);
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
System.out.println("Loaded " + aff);
List<String> words = loadWords(code, wordCount, dictionary);
Stemmer stemmer = new Stemmer(dictionary);
SpellChecker speller = new SpellChecker(dictionary);
measure(
"Stemming " + code,
blackHole -> {
for (String word : words) {
blackHole.accept(stemmer.stem(word));
}
});
measure(
"Spellchecking " + code,
blackHole -> {
for (String word : words) {
blackHole.accept(speller.spell(word));
}
});
System.out.println();
}
private Path findAffFile(String code) throws IOException {
return TestAllDictionaries.findAllAffixFiles()
.filter(
path -> {
String parentName = path.getParent().getFileName().toString();
return code.equals(Dictionary.extractLanguageCode(parentName));
})
.findFirst()
.orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
}
private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
throws IOException {
String corpusDir = System.getProperty("hunspell.corpora");
Assume.assumeFalse("Missing -Dhunspell.corpora=...", corpusDir == null);
Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
List<String> words = new ArrayList<>();
try (InputStream stream = Files.newInputStream(dataPath)) {
BufferedReader reader =
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
while (true) {
String line = reader.readLine();
if (line == null) break;
for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
String word = stripPunctuation(token);
if (word != null) {
words.add(word);
if (words.size() == wordCount) {
return words;
}
}
}
}
}
return words;
}
private void measure(String what, Iteration iteration) {
Consumer<Object> consumer =
o -> {
if (o == null) {
throw new AssertionError();
}
};
// warmup
for (int i = 0; i < 2; i++) {
iteration.run(consumer);
}
List<Long> times = new ArrayList<>();
for (int i = 0; i < 7; i++) {
long start = System.currentTimeMillis();
iteration.run(consumer);
times.add(System.currentTimeMillis() - start);
}
System.out.println(
what
+ ": average "
+ times.stream().mapToLong(Long::longValue).average().orElseThrow()
+ ", all times = "
+ times);
}
private interface Iteration {
void run(Consumer<Object> blackHole);
}
static String stripPunctuation(String token) {
int start = 0;
int end = token.length();
while (start < end && isPunctuation(token.charAt(start))) start++;
while (start < end - 1 && isPunctuation(token.charAt(end - 1))) end--;
return start < end ? token.substring(start, end) : null;
}
private static boolean isPunctuation(char c) {
return ".!?,\"'".indexOf(c) >= 0;
}
}
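Extending the benchmark to another language only requires a corpus file following the langCode.txt convention from the class javadoc and a matching dictionary directory. A hypothetical addition to the class above might look like the following (the language, corpus file, and word count are illustrative, not part of the patch):

// Hypothetical test mirroring en()/de()/fr(): expects es.txt under -Dhunspell.corpora
// and an es_* dictionary under -Dhunspell.dictionaries; the word count is arbitrary.
@Test
public void es() throws Exception {
  checkPerformance("es", 50_000);
}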