mirror of https://github.com/apache/lucene.git
LUCENE-9728: Hunspell: add a performance test (#2296)
parent 650f16ad5d · commit 82f8d7ba1d
@@ -94,6 +94,7 @@ grant {
   // Some Hunspell tests may read from external files specified in system properties
   permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
   permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
+  permission java.io.FilePermission "${hunspell.corpora}${/}-", "read";
 };
 
 // Permissions to support ant build
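In the policy syntax above, ${hunspell.corpora} expands to the value of the matching -D system property, ${/} to the platform file separator, and the trailing "-" makes the grant recursive. With a hypothetical -Dhunspell.corpora=/data/corpora, the added entry effectively amounts to:

  permission java.io.FilePermission "/data/corpora/-", "read";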
@@ -109,6 +109,8 @@ public class Dictionary {
   char[] stripData;
   int[] stripOffsets;
 
+  String wordChars = "";
+
   // 4 chars per affix, each char representing an unsigned 2-byte integer
   char[] affixData = new char[32];
   private int currentAffix = 0;

@@ -384,11 +386,12 @@ public class Dictionary {
         fullStrip = true;
       } else if ("LANG".equals(firstWord)) {
         language = singleArgument(reader, line);
-        int underscore = language.indexOf("_");
-        String langCode = underscore < 0 ? language : language.substring(0, underscore);
+        String langCode = extractLanguageCode(language);
         alternateCasing = langCode.equals("tr") || langCode.equals("az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
+      } else if ("WORDCHARS".equals(firstWord)) {
+        wordChars = singleArgument(reader, line);
       } else if ("TRY".equals(firstWord)) {
         tryChars = singleArgument(reader, line);
       } else if ("REP".equals(firstWord)) {

@@ -460,6 +463,11 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  static String extractLanguageCode(String isoCode) {
+    int underscore = isoCode.indexOf("_");
+    return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
+  }
+
   private String singleArgument(LineNumberReader reader, String line) throws ParseException {
     return splitBySpace(reader, line, 2)[1];
   }
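The extracted extractLanguageCode helper is package-private, so the tests below can reuse it (TestPerformance matches dictionary directories by language code with it). A minimal usage sketch with illustrative inputs:

  // keeps only the part of an ISO-style code before the first underscore
  Dictionary.extractLanguageCode("de_DE"); // -> "de"
  Dictionary.extractLanguageCode("tr");    // -> "tr", which in turn switches on the Turkish/Azeri alternate casing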
@@ -21,8 +21,8 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.ParseException;
-import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;

@@ -41,15 +41,13 @@ import org.junit.Ignore;
 @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
 public class TestAllDictionaries extends LuceneTestCase {
 
-  private static List<Path> findAllAffixFiles() throws IOException {
+  static Stream<Path> findAllAffixFiles() throws IOException {
     String dicDir = System.getProperty("hunspell.dictionaries");
     Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
-    return Files.walk(Path.of(dicDir), 2)
-        .filter(f -> f.toString().endsWith(".aff"))
-        .collect(Collectors.toList());
+    return Files.walk(Path.of(dicDir), 2).filter(f -> f.toString().endsWith(".aff"));
   }
 
-  private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
+  static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
     String affPath = aff.toString();
     Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
     assert Files.exists(dic) : dic;

@@ -62,7 +60,7 @@ public class TestAllDictionaries extends LuceneTestCase {
 
   public void testDictionariesLoadSuccessfully() throws Exception {
     int failures = 0;
-    for (Path aff : findAllAffixFiles()) {
+    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
       try {
         System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
       } catch (Throwable e) {
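Returning a Stream lets TestPerformance.findAffFile (below) stop at the first matching .aff file, while callers that want a plain loop collect the stream first, because a Stream is single-use and not Iterable. A sketch of both consumption styles, assuming the same package, the usual java.util/java.nio imports, and a surrounding method that declares IOException; the "en" prefix filter is illustrative only:

  // lazy: short-circuits at the first match, as TestPerformance.findAffFile does
  Optional<Path> anyEnglish =
      TestAllDictionaries.findAllAffixFiles()
          .filter(p -> p.getParent().getFileName().toString().startsWith("en"))
          .findFirst();

  // eager: materialize when a plain for-each over every affix file is needed
  for (Path aff : TestAllDictionaries.findAllAffixFiles().collect(Collectors.toList())) {
    System.out.println(aff);
  }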
@@ -0,0 +1,168 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import com.carrotsearch.randomizedtesting.annotations.TestCaseOrdering;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.Ignore;
import org.junit.Test;

/**
 * A test that runs various Hunspell APIs on real dictionaries and relatively large corpora for
 * specific languages and prints the execution times. The dictionaries should be set up as in
 * {@link TestAllDictionaries}, the corpora should be in files named {@code langCode.txt} (e.g.
 * {@code en.txt}) in a directory specified in {@code -Dhunspell.corpora=...}
 */
@TestCaseOrdering(TestCaseOrdering.AlphabeticOrder.class)
@Ignore("enable manually")
public class TestPerformance extends LuceneTestCase {

  @Test
  public void en() throws Exception {
    checkPerformance("en", 500_000);
  }

  @Test
  public void de() throws Exception {
    checkPerformance("de", 100_000);
  }

  @Test
  public void fr() throws Exception {
    checkPerformance("fr", 20_000);
  }

  private void checkPerformance(String code, int wordCount) throws Exception {
    Path aff = findAffFile(code);
    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
    System.out.println("Loaded " + aff);

    List<String> words = loadWords(code, wordCount, dictionary);

    Stemmer stemmer = new Stemmer(dictionary);
    SpellChecker speller = new SpellChecker(dictionary);
    measure(
        "Stemming " + code,
        blackHole -> {
          for (String word : words) {
            blackHole.accept(stemmer.stem(word));
          }
        });
    measure(
        "Spellchecking " + code,
        blackHole -> {
          for (String word : words) {
            blackHole.accept(speller.spell(word));
          }
        });
    System.out.println();
  }

  private Path findAffFile(String code) throws IOException {
    return TestAllDictionaries.findAllAffixFiles()
        .filter(
            path -> {
              String parentName = path.getParent().getFileName().toString();
              return code.equals(Dictionary.extractLanguageCode(parentName));
            })
        .findFirst()
        .orElseThrow(() -> new IllegalArgumentException("Cannot find aff/dic for " + code));
  }

  private List<String> loadWords(String code, int wordCount, Dictionary dictionary)
      throws IOException {
    String corpusDir = System.getProperty("hunspell.corpora");
    Assume.assumeFalse("", corpusDir == null);

    Path dataPath = Path.of(corpusDir).resolve(code + ".txt");
    List<String> words = new ArrayList<>();
    try (InputStream stream = Files.newInputStream(dataPath)) {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
      while (true) {
        String line = reader.readLine();
        if (line == null) break;

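        // split the line on runs of characters outside [a-zA-Z] and the dictionary's WORDCHARS
        // (which typically include word-internal characters such as apostrophes)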
        for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
          String word = stripPunctuation(token);
          if (word != null) {
            words.add(word);
            if (words.size() == wordCount) {
              return words;
            }
          }
        }
      }
    }
    return words;
  }

  private void measure(String what, Iteration iteration) {
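    // feed every result through a consumer that rejects nulls; keeping results observable
    // this way also prevents the measured work from being optimized away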
    Consumer<Object> consumer =
        o -> {
          if (o == null) {
            throw new AssertionError();
          }
        };

    // warmup
    for (int i = 0; i < 2; i++) {
      iteration.run(consumer);
    }

    List<Long> times = new ArrayList<>();
    for (int i = 0; i < 7; i++) {
      long start = System.currentTimeMillis();
      iteration.run(consumer);
      times.add(System.currentTimeMillis() - start);
    }
    System.out.println(
        what
            + ": average "
            + times.stream().mapToLong(Long::longValue).average().orElseThrow()
            + ", all times = "
            + times);
  }

  private interface Iteration {
    void run(Consumer<Object> blackHole);
  }

  static String stripPunctuation(String token) {
    int start = 0;
    int end = token.length();
    while (start < end && isPunctuation(token.charAt(start))) start++;
    while (start < end - 1 && isPunctuation(token.charAt(end - 1))) end--;
    return start < end ? token.substring(start, end) : null;
  }

  private static boolean isPunctuation(char c) {
    return ".!?,\"'’‘".indexOf(c) >= 0;
  }
}
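A hedged extension sketch (the language code and word count are illustrative, not part of the commit): covering another language only needs a corpus file named after its code in the -Dhunspell.corpora directory plus one more test method, and @TestCaseOrdering keeps the per-language tests in a stable alphabetical order:

  // hypothetical addition to TestPerformance, assuming es.txt exists in the corpora directory
  @Test
  public void es() throws Exception {
    checkPerformance("es", 50_000);
  }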