mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 21:39:25 +00:00
LUCENE-9811: Hunspell suggestions: speed up ngram calculation by not searching for substrings in impossible places (#2428)
This commit is contained in:
parent
f8be421ae1
commit
9d6fd98810
@ -355,12 +355,20 @@ class GeneratingSuggester {
|
||||
if (l2 == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int[] lastStarts = new int[l1];
|
||||
for (int j = 1; j <= n; j++) {
|
||||
int ns = 0;
|
||||
for (int i = 0; i <= (l1 - j); i++) {
|
||||
if (s2.contains(s1.substring(i, i + j))) {
|
||||
ns++;
|
||||
} else if (opt.contains(NGramOptions.WEIGHTED)) {
|
||||
if (lastStarts[i] >= 0) {
|
||||
int pos = indexOfSubstring(s2, lastStarts[i], s1, i, j);
|
||||
lastStarts[i] = pos;
|
||||
if (pos >= 0) {
|
||||
ns++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (opt.contains(NGramOptions.WEIGHTED)) {
|
||||
ns--;
|
||||
if (i == 0 || i == l1 - j) {
|
||||
ns--; // side weight
|
||||
@ -383,6 +391,19 @@ class GeneratingSuggester {
|
||||
return score - Math.max(ns, 0);
|
||||
}
|
||||
|
||||
private static int indexOfSubstring(
|
||||
String haystack, int haystackPos, String needle, int needlePos, int len) {
|
||||
char c = needle.charAt(needlePos);
|
||||
int limit = haystack.length() - len;
|
||||
for (int i = haystackPos; i <= limit; i++) {
|
||||
if (haystack.charAt(i) == c
|
||||
&& haystack.regionMatches(i + 1, needle, needlePos + 1, len - 1)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int lcs(String s1, String s2) {
|
||||
int[] lengths = new int[s2.length() + 1];
|
||||
|
||||
|
@ -62,7 +62,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||
|
||||
@Test
|
||||
public void en_suggest() throws Exception {
|
||||
checkSuggestionPerformance("en", 1_000);
|
||||
checkSuggestionPerformance("en", 1_200);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
x
Reference in New Issue
Block a user