mirror of https://github.com/apache/lucene.git
Hunspell suggestions: speed up for some non-Latin scripts (#19)
This commit is contained in:
parent
8913a98379
commit
cdff0accaa
|
@ -94,10 +94,12 @@ class GeneratingSuggester {
|
|||
return;
|
||||
}
|
||||
|
||||
int sc =
|
||||
automaton.ngramScore(rootChars)
|
||||
- longerWorsePenalty(word.length(), rootChars.length)
|
||||
+ commonPrefix(word, rootChars);
|
||||
int sc = automaton.ngramScore(rootChars);
|
||||
if (sc == 0) {
|
||||
return; // no common characters at all, don't suggest this root
|
||||
}
|
||||
|
||||
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
||||
|
||||
if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
|
||||
return;
|
||||
|
|
|
@ -33,6 +33,7 @@ class TrigramAutomaton {
|
|||
private final CharacterRunAutomaton automaton;
|
||||
private final int[] state2Score;
|
||||
private final FixedBitSet countedSubstrings;
|
||||
private final char minChar;
|
||||
|
||||
TrigramAutomaton(String s1) {
|
||||
Map<String, Integer> substringCounts = new HashMap<>();
|
||||
|
@ -40,6 +41,8 @@ class TrigramAutomaton {
|
|||
Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
|
||||
int initialState = builder.createState();
|
||||
|
||||
minChar = (char) s1.chars().min().orElseThrow();
|
||||
|
||||
for (int start = 0; start < s1.length(); start++) {
|
||||
int limit = Math.min(s1.length(), start + N);
|
||||
for (int end = start + 1; end <= limit; end++) {
|
||||
|
@ -49,7 +52,7 @@ class TrigramAutomaton {
|
|||
int state = initialState;
|
||||
for (int i = start; i < limit; i++) {
|
||||
int next = builder.createState();
|
||||
builder.addTransition(state, next, s1.charAt(i));
|
||||
builder.addTransition(state, next, s1.charAt(i) - minChar);
|
||||
state = next;
|
||||
}
|
||||
}
|
||||
|
@ -70,7 +73,7 @@ class TrigramAutomaton {
|
|||
// Feeds the chars of s to the automaton one at a time, beginning at state 0,
// and returns the state reached after the last char.
// NOTE(review): this is a diff rendering. The two step(...) statements in the
// loop are the pre-change and post-change versions of ONE statement (the hunk
// header reports 7 lines before and 7 after), not two consecutive steps.
private int runAutomatonOnStringChars(String s) {
|
||||
int state = 0;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
// Presumably the removed line: steps on the raw char value.
state = automaton.step(state, s.charAt(i));
|
||||
// Presumably the added line: steps on the char shifted down by minChar, the
// same shift the constructor hunk applies to its transition labels
// (s1.charAt(i) - minChar).
state = automaton.step(state, s.charAt(i) - minChar);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
@ -86,6 +89,11 @@ class TrigramAutomaton {
|
|||
int limit = s2.length + s2.offset;
|
||||
for (int i = s2.offset; i < limit; i++) {
|
||||
char c = transformChar(s2.chars[i]);
|
||||
if (c < minChar) {
|
||||
state1 = state2 = -1;
|
||||
continue;
|
||||
}
|
||||
c -= minChar;
|
||||
|
||||
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
|
||||
if (state3 > 0) {
|
||||
|
|
|
@ -65,6 +65,16 @@ public class TestPerformance extends LuceneTestCase {
|
|||
checkSuggestionPerformance("en", 3_000);
|
||||
}
|
||||
|
||||
// Runs checkAnalysisPerformance for the "ru" code with 400_000 as the second
// argument. NOTE(review): presumably a word count and a Russian dictionary;
// confirm against checkAnalysisPerformance, which is not visible in this hunk.
@Test
|
||||
public void ru() throws Exception {
|
||||
checkAnalysisPerformance("ru", 400_000);
|
||||
}
|
||||
|
||||
// Runs checkSuggestionPerformance for the "ru" code with 1000 as the second
// argument. NOTE(review): presumably a word count; confirm against
// checkSuggestionPerformance, whose signature is not visible in this hunk.
@Test
|
||||
public void ru_suggest() throws Exception {
|
||||
checkSuggestionPerformance("ru", 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void de() throws Exception {
|
||||
checkAnalysisPerformance("de", 300_000);
|
||||
|
@ -121,6 +131,7 @@ public class TestPerformance extends LuceneTestCase {
|
|||
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
||||
List<String> words =
|
||||
loadWords(code, wordCount, dictionary).stream()
|
||||
.distinct()
|
||||
.filter(w -> hasQuickSuggestions(speller, w))
|
||||
.collect(Collectors.toList());
|
||||
System.out.println("Checking " + words.size() + " misspelled words");
|
||||
|
@ -181,7 +192,8 @@ public class TestPerformance extends LuceneTestCase {
|
|||
String line = reader.readLine();
|
||||
if (line == null) break;
|
||||
|
||||
for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
|
||||
for (String token :
|
||||
line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
|
||||
String word = stripPunctuation(token);
|
||||
if (word != null) {
|
||||
words.add(word);
|
||||
|
|
Loading…
Reference in New Issue