mirror of https://github.com/apache/lucene.git
hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration (#12491)
* hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration: avoid the automaton access on definitely absent characters; count the scores for all substring lengths together
This commit is contained in:
parent
dd4e66dad6
commit
13e747f95f
|
@ -75,7 +75,7 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
|
* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
|
||||||
|
|
||||||
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
|
* GITHUB#12447, GITHUB#12491: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
|
||||||
|
|
||||||
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)
|
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ class TrigramAutomaton {
|
||||||
private final CharacterRunAutomaton automaton;
|
private final CharacterRunAutomaton automaton;
|
||||||
private final int[] state2Score;
|
private final int[] state2Score;
|
||||||
private final FixedBitSet countedSubstrings;
|
private final FixedBitSet countedSubstrings;
|
||||||
private final char minChar;
|
private final char minChar, maxChar;
|
||||||
|
|
||||||
TrigramAutomaton(String s1) {
|
TrigramAutomaton(String s1) {
|
||||||
Map<String, Integer> substringCounts = new HashMap<>();
|
Map<String, Integer> substringCounts = new HashMap<>();
|
||||||
|
@ -41,6 +41,7 @@ class TrigramAutomaton {
|
||||||
int initialState = builder.createState();
|
int initialState = builder.createState();
|
||||||
|
|
||||||
minChar = (char) s1.chars().min().orElseThrow();
|
minChar = (char) s1.chars().min().orElseThrow();
|
||||||
|
maxChar = (char) s1.chars().max().orElseThrow();
|
||||||
|
|
||||||
for (int start = 0; start < s1.length(); start++) {
|
for (int start = 0; start < s1.length(); start++) {
|
||||||
int limit = Math.min(s1.length(), start + N);
|
int limit = Math.min(s1.length(), start + N);
|
||||||
|
@ -80,7 +81,7 @@ class TrigramAutomaton {
|
||||||
int ngramScore(CharSequence s2) {
|
int ngramScore(CharSequence s2) {
|
||||||
countedSubstrings.clear();
|
countedSubstrings.clear();
|
||||||
|
|
||||||
int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
|
int score = 0;
|
||||||
|
|
||||||
// states of running the automaton on substrings [i-1, i) and [i-2, i)
|
// states of running the automaton on substrings [i-1, i) and [i-2, i)
|
||||||
int state1 = -1, state2 = -1;
|
int state1 = -1, state2 = -1;
|
||||||
|
@ -88,7 +89,7 @@ class TrigramAutomaton {
|
||||||
int limit = s2.length();
|
int limit = s2.length();
|
||||||
for (int i = 0; i < limit; i++) {
|
for (int i = 0; i < limit; i++) {
|
||||||
char c = s2.charAt(i);
|
char c = s2.charAt(i);
|
||||||
if (c < minChar) {
|
if (c < minChar || c > maxChar) {
|
||||||
state1 = state2 = -1;
|
state1 = state2 = -1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -96,27 +97,20 @@ class TrigramAutomaton {
|
||||||
|
|
||||||
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
|
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
|
||||||
if (state3 > 0) {
|
if (state3 > 0) {
|
||||||
score3 += substringScore(state3, countedSubstrings);
|
score += substringScore(state3, countedSubstrings);
|
||||||
}
|
}
|
||||||
|
|
||||||
state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
|
state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
|
||||||
if (state2 > 0) {
|
if (state2 > 0) {
|
||||||
score2 += substringScore(state2, countedSubstrings);
|
score += substringScore(state2, countedSubstrings);
|
||||||
}
|
}
|
||||||
|
|
||||||
state1 = automaton.step(0, c);
|
state1 = automaton.step(0, c);
|
||||||
if (state1 > 0) {
|
if (state1 > 0) {
|
||||||
score1 += substringScore(state1, countedSubstrings);
|
score += substringScore(state1, countedSubstrings);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int score = score1;
|
|
||||||
if (score1 >= 2) {
|
|
||||||
score += score2;
|
|
||||||
if (score2 >= 2) {
|
|
||||||
score += score3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -106,7 +106,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void uk_suggest() throws Exception {
|
public void uk_suggest() throws Exception {
|
||||||
checkSuggestionPerformance("uk", 700);
|
checkSuggestionPerformance("uk", 800);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
||||||
|
|
Loading…
Reference in New Issue