From 13e747f95f7b8277cc961e83b4990e6d152dd351 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 8 Aug 2023 22:40:42 +0200 Subject: [PATCH] hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration (#12491) * hunspell: simplify TrigramAutomaton to speed up the suggestion enumeration avoid the automaton access on definitely absent characters; count the scores for all substring lengths together --- lucene/CHANGES.txt | 2 +- .../analysis/hunspell/TrigramAutomaton.java | 20 +++++++------------ .../analysis/hunspell/TestPerformance.java | 2 +- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b3a7df1b5be..13fd71a2b88 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -75,7 +75,7 @@ Improvements * LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan) -* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov) +* GITHUB#12447, GITHUB#12491: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov) * GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java index 2e6091541d2..06d528fc001 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java @@ -32,7 +32,7 @@ class TrigramAutomaton { private final CharacterRunAutomaton automaton; private final int[] state2Score; private final FixedBitSet countedSubstrings; - private final char minChar; + private final char minChar, maxChar; TrigramAutomaton(String s1) { Map substringCounts = new HashMap<>(); @@ -41,6 +41,7 @@ class TrigramAutomaton { int initialState = builder.createState(); minChar = (char) s1.chars().min().orElseThrow(); + maxChar = (char) s1.chars().max().orElseThrow(); for (int start = 0; start < s1.length(); start++) { int limit = Math.min(s1.length(), start + N); @@ -80,7 +81,7 @@ class TrigramAutomaton { int ngramScore(CharSequence s2) { countedSubstrings.clear(); - int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3 + int score = 0; // states of running the automaton on substrings [i-1, i) and [i-2, i) int state1 = -1, state2 = -1; @@ -88,7 +89,7 @@ class TrigramAutomaton { int limit = s2.length(); for (int i = 0; i < limit; i++) { char c = s2.charAt(i); - if (c < minChar) { + if (c < minChar || c > maxChar) { state1 = state2 = -1; continue; } @@ -96,27 +97,20 @@ class TrigramAutomaton { int state3 = state2 <= 0 ? 0 : automaton.step(state2, c); if (state3 > 0) { - score3 += substringScore(state3, countedSubstrings); + score += substringScore(state3, countedSubstrings); } state2 = state1 <= 0 ? 0 : automaton.step(state1, c); if (state2 > 0) { - score2 += substringScore(state2, countedSubstrings); + score += substringScore(state2, countedSubstrings); } state1 = automaton.step(0, c); if (state1 > 0) { - score1 += substringScore(state1, countedSubstrings); + score += substringScore(state1, countedSubstrings); } } - int score = score1; - if (score1 >= 2) { - score += score2; - if (score2 >= 2) { - score += score3; - } - } return score; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index 232f12e13c1..04989954591 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -106,7 +106,7 @@ public class TestPerformance extends LuceneTestCase { @Test public void uk_suggest() throws Exception { - checkSuggestionPerformance("uk", 700); + checkSuggestionPerformance("uk", 800); } private Dictionary loadDictionary(String code) throws IOException, ParseException {