LUCENE-9811: Hunspell suggestions: speed up ngram calculation by not searching for substrings in impossible places (#2428)

This commit is contained in:
Peter Gromov 2021-02-24 17:41:50 +01:00 committed by GitHub
parent f8be421ae1
commit 9d6fd98810
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 4 deletions

View File

@ -355,12 +355,20 @@ class GeneratingSuggester {
if (l2 == 0) {
return 0;
}
int[] lastStarts = new int[l1];
for (int j = 1; j <= n; j++) {
int ns = 0;
for (int i = 0; i <= (l1 - j); i++) {
if (s2.contains(s1.substring(i, i + j))) {
ns++;
} else if (opt.contains(NGramOptions.WEIGHTED)) {
if (lastStarts[i] >= 0) {
int pos = indexOfSubstring(s2, lastStarts[i], s1, i, j);
lastStarts[i] = pos;
if (pos >= 0) {
ns++;
continue;
}
}
if (opt.contains(NGramOptions.WEIGHTED)) {
ns--;
if (i == 0 || i == l1 - j) {
ns--; // side weight
@ -383,6 +391,19 @@ class GeneratingSuggester {
return score - Math.max(ns, 0);
}
/**
 * Searches {@code haystack}, starting at {@code haystackPos}, for the {@code len}-character
 * region of {@code needle} that begins at {@code needlePos}.
 *
 * @param haystack the string to search in
 * @param haystackPos the first haystack offset to try
 * @param needle the string holding the region to look for
 * @param needlePos offset in {@code needle} where the sought region starts
 * @param len number of characters in the sought region
 * @return the smallest offset {@code >= haystackPos} at which the region occurs in {@code
 *     haystack}, or {@code -1} if there is no such offset
 */
private static int indexOfSubstring(
    String haystack, int haystackPos, String needle, int needlePos, int len) {
  final char firstChar = needle.charAt(needlePos);
  // A match cannot begin past this offset: the region would not fit in the haystack.
  final int lastStart = haystack.length() - len;
  int candidate = haystackPos;
  while (candidate <= lastStart) {
    // Compare the first character directly and only then compare the remaining len - 1 chars.
    boolean firstMatches = haystack.charAt(candidate) == firstChar;
    if (firstMatches && haystack.regionMatches(candidate + 1, needle, needlePos + 1, len - 1)) {
      return candidate;
    }
    candidate++;
  }
  return -1;
}
private static int lcs(String s1, String s2) {
int[] lengths = new int[s2.length() + 1];

View File

@ -62,7 +62,7 @@ public class TestPerformance extends LuceneTestCase {
// Calls checkSuggestionPerformance with "en" and a numeric second argument.
// NOTE(review): the two calls below look like the removed (1_000) and added (1_200) sides of a
// one-line diff change rather than two intended invocations — confirm against the real file.
// The meaning of the numeric argument is not visible here — TODO confirm.
@Test
public void en_suggest() throws Exception {
checkSuggestionPerformance("en", 1_000);
checkSuggestionPerformance("en", 1_200);
}
@Test