hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration (#12491)

* hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration

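Avoid stepping the automaton on characters that definitely cannot occur in the indexed string (those outside its [minChar, maxChar] range);
accumulate the scores for substrings of all lengths (1, 2 and 3) into a single total instead of tracking them separately.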
This commit is contained in:
Peter Gromov 2023-08-08 22:40:42 +02:00 committed by GitHub
parent dd4e66dad6
commit 13e747f95f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 9 additions and 15 deletions

View File

@ -75,7 +75,7 @@ Improvements
* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
* GITHUB#12447, GITHUB#12491: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)

View File

@ -32,7 +32,7 @@ class TrigramAutomaton {
private final CharacterRunAutomaton automaton;
private final int[] state2Score;
private final FixedBitSet countedSubstrings;
private final char minChar;
private final char minChar, maxChar;
TrigramAutomaton(String s1) {
Map<String, Integer> substringCounts = new HashMap<>();
@ -41,6 +41,7 @@ class TrigramAutomaton {
int initialState = builder.createState();
minChar = (char) s1.chars().min().orElseThrow();
maxChar = (char) s1.chars().max().orElseThrow();
for (int start = 0; start < s1.length(); start++) {
int limit = Math.min(s1.length(), start + N);
@ -80,7 +81,7 @@ class TrigramAutomaton {
int ngramScore(CharSequence s2) {
countedSubstrings.clear();
int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
int score = 0;
// states of running the automaton on substrings [i-1, i) and [i-2, i)
int state1 = -1, state2 = -1;
@ -88,7 +89,7 @@ class TrigramAutomaton {
int limit = s2.length();
for (int i = 0; i < limit; i++) {
char c = s2.charAt(i);
if (c < minChar) {
if (c < minChar || c > maxChar) {
state1 = state2 = -1;
continue;
}
@ -96,27 +97,20 @@ class TrigramAutomaton {
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
if (state3 > 0) {
score3 += substringScore(state3, countedSubstrings);
score += substringScore(state3, countedSubstrings);
}
state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
if (state2 > 0) {
score2 += substringScore(state2, countedSubstrings);
score += substringScore(state2, countedSubstrings);
}
state1 = automaton.step(0, c);
if (state1 > 0) {
score1 += substringScore(state1, countedSubstrings);
score += substringScore(state1, countedSubstrings);
}
}
int score = score1;
if (score1 >= 2) {
score += score2;
if (score2 >= 2) {
score += score3;
}
}
return score;
}

View File

@ -106,7 +106,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void uk_suggest() throws Exception {
checkSuggestionPerformance("uk", 700);
checkSuggestionPerformance("uk", 800);
}
private Dictionary loadDictionary(String code) throws IOException, ParseException {