mirror of https://github.com/apache/lucene.git
hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration (#12491)
* hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration: avoid the automaton access on definitely absent characters; count the scores for all substring lengths together.
This commit is contained in:
parent
dd4e66dad6
commit
13e747f95f
|
@ -75,7 +75,7 @@ Improvements
|
|||
|
||||
* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
|
||||
|
||||
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
|
||||
* GITHUB#12447, GITHUB#12491: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
|
||||
|
||||
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ class TrigramAutomaton {
|
|||
private final CharacterRunAutomaton automaton;
|
||||
private final int[] state2Score;
|
||||
private final FixedBitSet countedSubstrings;
|
||||
private final char minChar;
|
||||
private final char minChar, maxChar;
|
||||
|
||||
TrigramAutomaton(String s1) {
|
||||
Map<String, Integer> substringCounts = new HashMap<>();
|
||||
|
@ -41,6 +41,7 @@ class TrigramAutomaton {
|
|||
int initialState = builder.createState();
|
||||
|
||||
minChar = (char) s1.chars().min().orElseThrow();
|
||||
maxChar = (char) s1.chars().max().orElseThrow();
|
||||
|
||||
for (int start = 0; start < s1.length(); start++) {
|
||||
int limit = Math.min(s1.length(), start + N);
|
||||
|
@ -80,7 +81,7 @@ class TrigramAutomaton {
|
|||
int ngramScore(CharSequence s2) {
|
||||
countedSubstrings.clear();
|
||||
|
||||
int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
|
||||
int score = 0;
|
||||
|
||||
// states of running the automaton on substrings [i-1, i) and [i-2, i)
|
||||
int state1 = -1, state2 = -1;
|
||||
|
@ -88,7 +89,7 @@ class TrigramAutomaton {
|
|||
int limit = s2.length();
|
||||
for (int i = 0; i < limit; i++) {
|
||||
char c = s2.charAt(i);
|
||||
if (c < minChar) {
|
||||
if (c < minChar || c > maxChar) {
|
||||
state1 = state2 = -1;
|
||||
continue;
|
||||
}
|
||||
|
@ -96,27 +97,20 @@ class TrigramAutomaton {
|
|||
|
||||
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
|
||||
if (state3 > 0) {
|
||||
score3 += substringScore(state3, countedSubstrings);
|
||||
score += substringScore(state3, countedSubstrings);
|
||||
}
|
||||
|
||||
state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
|
||||
if (state2 > 0) {
|
||||
score2 += substringScore(state2, countedSubstrings);
|
||||
score += substringScore(state2, countedSubstrings);
|
||||
}
|
||||
|
||||
state1 = automaton.step(0, c);
|
||||
if (state1 > 0) {
|
||||
score1 += substringScore(state1, countedSubstrings);
|
||||
score += substringScore(state1, countedSubstrings);
|
||||
}
|
||||
}
|
||||
|
||||
int score = score1;
|
||||
if (score1 >= 2) {
|
||||
score += score2;
|
||||
if (score2 >= 2) {
|
||||
score += score3;
|
||||
}
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ public class TestPerformance extends LuceneTestCase {
|
|||
|
||||
@Test
|
||||
public void uk_suggest() throws Exception {
|
||||
checkSuggestionPerformance("uk", 700);
|
||||
checkSuggestionPerformance("uk", 800);
|
||||
}
|
||||
|
||||
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
||||
|
|
Loading…
Reference in New Issue