From 9d6fd98810aff7311e124be98f8c8f8cc66520be Mon Sep 17 00:00:00 2001
From: Peter Gromov
Date: Wed, 24 Feb 2021 17:41:50 +0100
Subject: [PATCH] LUCENE-9811: Hunspell suggestions: speed up ngram calculation
 by not searching for substrings in impossible places (#2428)

---
 .../hunspell/GeneratingSuggester.java         | 27 ++++++++++++++++---
 .../analysis/hunspell/TestPerformance.java    |  2 +-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index c1bbdf9e555..cb5286132fc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -355,12 +355,20 @@ class GeneratingSuggester {
     if (l2 == 0) {
       return 0;
     }
+
+    int[] lastStarts = new int[l1];
     for (int j = 1; j <= n; j++) {
       int ns = 0;
       for (int i = 0; i <= (l1 - j); i++) {
-        if (s2.contains(s1.substring(i, i + j))) {
-          ns++;
-        } else if (opt.contains(NGramOptions.WEIGHTED)) {
+        if (lastStarts[i] >= 0) {
+          int pos = indexOfSubstring(s2, lastStarts[i], s1, i, j);
+          lastStarts[i] = pos;
+          if (pos >= 0) {
+            ns++;
+            continue;
+          }
+        }
+        if (opt.contains(NGramOptions.WEIGHTED)) {
           ns--;
           if (i == 0 || i == l1 - j) {
             ns--; // side weight
@@ -383,6 +391,19 @@ class GeneratingSuggester {
     return score - Math.max(ns, 0);
   }
 
+  private static int indexOfSubstring(
+      String haystack, int haystackPos, String needle, int needlePos, int len) {
+    char c = needle.charAt(needlePos);
+    int limit = haystack.length() - len;
+    for (int i = haystackPos; i <= limit; i++) {
+      if (haystack.charAt(i) == c
+          && haystack.regionMatches(i + 1, needle, needlePos + 1, len - 1)) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
   private static int lcs(String s1, String s2) {
     int[] lengths = new int[s2.length() + 1];
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 2dfa2a8cdbc..eda0f7659fe 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -62,7 +62,7 @@ public class TestPerformance extends LuceneTestCase {
 
   @Test
   public void en_suggest() throws Exception {
-    checkSuggestionPerformance("en", 1_000);
+    checkSuggestionPerformance("en", 1_200);
   }
 
   @Test