Hunspell suggestions: speed up for some non-Latin scripts (#19)

This commit is contained in:
Peter Gromov 2021-03-15 10:02:45 +01:00 committed by GitHub
parent 8913a98379
commit cdff0accaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 29 additions and 7 deletions

View File

@ -94,10 +94,12 @@ class GeneratingSuggester {
return;
}
int sc =
automaton.ngramScore(rootChars)
- longerWorsePenalty(word.length(), rootChars.length)
+ commonPrefix(word, rootChars);
int sc = automaton.ngramScore(rootChars);
if (sc == 0) {
return; // no common characters at all, don't suggest this root
}
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
return;

View File

@ -33,6 +33,7 @@ class TrigramAutomaton {
private final CharacterRunAutomaton automaton;
private final int[] state2Score;
private final FixedBitSet countedSubstrings;
private final char minChar;
TrigramAutomaton(String s1) {
Map<String, Integer> substringCounts = new HashMap<>();
@ -40,6 +41,8 @@ class TrigramAutomaton {
Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
int initialState = builder.createState();
minChar = (char) s1.chars().min().orElseThrow();
for (int start = 0; start < s1.length(); start++) {
int limit = Math.min(s1.length(), start + N);
for (int end = start + 1; end <= limit; end++) {
@ -49,7 +52,7 @@ class TrigramAutomaton {
int state = initialState;
for (int i = start; i < limit; i++) {
int next = builder.createState();
builder.addTransition(state, next, s1.charAt(i));
builder.addTransition(state, next, s1.charAt(i) - minChar);
state = next;
}
}
@ -70,7 +73,7 @@ class TrigramAutomaton {
/**
 * Feeds the characters of {@code s} to {@code automaton} one at a time, starting from state 0,
 * and returns whatever state the last {@code step} call produced (0 for an empty string).
 */
private int runAutomatonOnStringChars(String s) {
int state = 0;
for (int i = 0; i < s.length(); i++) {
// NOTE(review): this listing is a rendered diff without +/- markers, so the two assignments
// below look like the before/after forms of a single statement rather than two consecutive
// steps -- confirm against the real file. The second form shifts the character by minChar,
// presumably to match the transition labels, which the constructor also shifts by minChar.
state = automaton.step(state, s.charAt(i));
state = automaton.step(state, s.charAt(i) - minChar);
}
return state;
}
@ -86,6 +89,11 @@ class TrigramAutomaton {
int limit = s2.length + s2.offset;
for (int i = s2.offset; i < limit; i++) {
char c = transformChar(s2.chars[i]);
if (c < minChar) {
state1 = state2 = -1;
continue;
}
c -= minChar;
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
if (state3 > 0) {

View File

@ -65,6 +65,16 @@ public class TestPerformance extends LuceneTestCase {
checkSuggestionPerformance("en", 3_000);
}
/**
 * Analysis performance test for the "ru" code: delegates to {@code checkAnalysisPerformance}
 * with the arguments {@code "ru"} and {@code 400_000}.
 *
 * <p>NOTE(review): the numeric argument is presumably the number of words to process; confirm
 * against {@code checkAnalysisPerformance}, which is not visible in this excerpt.
 */
@Test
public void ru() throws Exception {
checkAnalysisPerformance("ru", 400_000);
}
/**
 * Suggestion performance test for the "ru" code: delegates to {@code checkSuggestionPerformance}
 * with the arguments {@code "ru"} and {@code 1000}.
 *
 * <p>NOTE(review): the numeric argument is presumably the number of words to load before
 * filtering down to misspelled ones; confirm against {@code checkSuggestionPerformance}.
 */
@Test
public void ru_suggest() throws Exception {
checkSuggestionPerformance("ru", 1000);
}
@Test
public void de() throws Exception {
checkAnalysisPerformance("de", 300_000);
@ -121,6 +131,7 @@ public class TestPerformance extends LuceneTestCase {
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
List<String> words =
loadWords(code, wordCount, dictionary).stream()
.distinct()
.filter(w -> hasQuickSuggestions(speller, w))
.collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words");
@ -181,7 +192,8 @@ public class TestPerformance extends LuceneTestCase {
String line = reader.readLine();
if (line == null) break;
for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
for (String token :
line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
String word = stripPunctuation(token);
if (word != null) {
words.add(word);