diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index bea4cf97dc4..450c77da942 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1025,6 +1025,7 @@ public class Dictionary { assert morphSep > 0; assert morphSep > flagSep; int sep = flagSep < 0 ? morphSep : flagSep; + if (sep == 0) return 0; CharSequence toWrite; String beforeSep = line.substring(0, sep); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 7a5cde7480e..0b68bce5c35 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -23,7 +23,6 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.EnumSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Objects; @@ -67,7 +66,7 @@ class GeneratingSuggester { PriorityQueue>> roots = new PriorityQueue<>(natural.reversed()); List> entries = new ArrayList<>(); boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de"); - EnumSet options = EnumSet.of(NGramOptions.LONGER_WORSE); + TrigramAutomaton automaton = new TrigramAutomaton(word); IntsRefFSTEnum fstEnum = new IntsRefFSTEnum<>(dictionary.words); InputOutput mapping; @@ -75,6 +74,7 @@ class GeneratingSuggester { speller.checkCanceled.run(); IntsRef key = mapping.input; + assert key.length > 0; if (Math.abs(key.length - word.length()) > MAX_ROOT_LENGTH_DIFF) { assert key.length < word.length(); // nextKey takes care of longer keys continue; @@ -89,7 +89,8 @@ class GeneratingSuggester { } String lower = dictionary.toLowerCase(root); - int sc = ngram(3, word, lower, options) + commonPrefix(word, root); + int sc = + automaton.ngramScore(lower) - longerWorsePenalty(word, lower) + commonPrefix(word, root); if (roots.size() == MAX_ROOTS && sc < roots.peek().score) { continue; @@ -152,7 +153,7 @@ class GeneratingSuggester { for (String guess : expandRoot(weighted.word, misspelled)) { String lower = dictionary.toLowerCase(guess); int sc = - ngram(misspelled.length(), misspelled, lower, EnumSet.of(NGramOptions.ANY_MISMATCH)) + anyMismatchNgram(misspelled.length(), misspelled, lower, false) + commonPrefix(misspelled, guess); if (sc > thresh) { expanded.add(new Weighted<>(guess, sc)); @@ -173,7 +174,7 @@ class GeneratingSuggester { mw[k] = '*'; } - thresh += ngram(word.length(), word, new String(mw), EnumSet.of(NGramOptions.ANY_MISMATCH)); + thresh += anyMismatchNgram(word.length(), word, new String(mw), false); } return thresh / 3 - 1; } @@ -314,16 +315,14 @@ class GeneratingSuggester { break; } - int re = - ngram(2, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED)) - + ngram(2, lower, word, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED)); + int re = anyMismatchNgram(2, word, lower, true) + anyMismatchNgram(2, lower, word, true); int score = 2 * lcs(word, lower) - Math.abs(word.length() - lower.length()) + commonCharacterPositionScore(word, lower) + commonPrefix(word, lower) - + ngram(4, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH)) + + anyMismatchNgram(4, word, lower, false) + re + (re < (word.length() + lower.length()) * fact ? -1000 : 0); bySimilarity.add(new Weighted<>(guess, score)); @@ -374,14 +373,9 @@ class GeneratingSuggester { } // generate an n-gram score comparing s1 and s2 - private static int ngram(int n, String s1, String s2, EnumSet opt) { - int score = 0; + static int ngramScore(int n, String s1, String s2, boolean weighted) { int l1 = s1.length(); - int l2 = s2.length(); - if (l2 == 0) { - return 0; - } - + int score = 0; int[] lastStarts = new int[l1]; for (int j = 1; j <= n; j++) { int ns = 0; @@ -394,7 +388,7 @@ class GeneratingSuggester { continue; } } - if (opt.contains(NGramOptions.WEIGHTED)) { + if (weighted) { ns--; if (i == 0 || i == l1 - j) { ns--; // side weight @@ -402,19 +396,21 @@ class GeneratingSuggester { } } score = score + ns; - if (ns < 2 && !opt.contains(NGramOptions.WEIGHTED)) { + if (ns < 2 && !weighted) { break; } } + return score; + } - int ns = 0; - if (opt.contains(NGramOptions.LONGER_WORSE)) { - ns = (l2 - l1) - 2; - } - if (opt.contains(NGramOptions.ANY_MISMATCH)) { - ns = Math.abs(l2 - l1) - 2; - } - return score - Math.max(ns, 0); + // NGRAM_LONGER_WORSE flag in Hunspell + private static int longerWorsePenalty(String s1, String s2) { + return Math.max((s2.length() - s1.length()) - 2, 0); + } + + // NGRAM_ANY_MISMATCH flag in Hunspell + private static int anyMismatchNgram(int n, String s1, String s2, boolean weighted) { + return ngramScore(n, s1, s2, weighted) - Math.max(Math.abs(s2.length() - s1.length()) - 2, 0); } private static int indexOfSubstring( @@ -471,12 +467,6 @@ class GeneratingSuggester { return commonScore; } - private enum NGramOptions { - WEIGHTED, - LONGER_WORSE, - ANY_MISMATCH - } - private static class Weighted> implements Comparable> { final T word; final int score; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java new file mode 100644 index 00000000000..4235a1b53c8 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; + +/** + * An automaton allowing to achieve the same results as non-weighted {@link + * GeneratingSuggester#ngramScore}, but faster (in O(s2.length) time). + */ +class TrigramAutomaton { + private static final int N = 3; + private final CharacterRunAutomaton automaton; + private final int[] state2Score; + private final FixedBitSet countedSubstrings; + + TrigramAutomaton(String s1) { + Map substringCounts = new HashMap<>(); + + Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N); + int initialState = builder.createState(); + + for (int start = 0; start < s1.length(); start++) { + int limit = Math.min(s1.length(), start + N); + for (int end = start + 1; end <= limit; end++) { + substringCounts.merge(s1.substring(start, end), 1, Integer::sum); + } + + int state = initialState; + for (int i = start; i < limit; i++) { + int next = builder.createState(); + builder.addTransition(state, next, s1.charAt(i)); + state = next; + } + } + + automaton = + new CharacterRunAutomaton( + Operations.determinize(builder.finish(), Operations.DEFAULT_MAX_DETERMINIZED_STATES)); + + state2Score = new int[automaton.getSize()]; + for (Map.Entry entry : substringCounts.entrySet()) { + int state = runAutomatonOnStringChars(entry.getKey()); + assert state2Score[state] == 0; + state2Score[state] = entry.getValue(); + } + countedSubstrings = new FixedBitSet(state2Score.length); + } + + private int runAutomatonOnStringChars(String s) { + int state = 0; + for (int i = 0; i < s.length(); i++) { + state = automaton.step(state, s.charAt(i)); + } + return state; + } + + int ngramScore(String s2) { + countedSubstrings.clear(0, countedSubstrings.length()); + + int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3 + + // states of running the automaton on substrings [i-1, i) and [i-2, i) + int state1 = -1, state2 = -1; + + int length = s2.length(); + for (int i = 0; i < length; i++) { + char c = s2.charAt(i); + + int state3 = state2 <= 0 ? 0 : automaton.step(state2, c); + if (state3 > 0) { + score3 += substringScore(state3, countedSubstrings); + } + + state2 = state1 <= 0 ? 0 : automaton.step(state1, c); + if (state2 > 0) { + score2 += substringScore(state2, countedSubstrings); + } + + state1 = automaton.step(0, c); + if (state1 > 0) { + score1 += substringScore(state1, countedSubstrings); + } + } + + int score = score1; + if (score1 >= 2) { + score += score2; + if (score2 >= 2) { + score += score3; + } + } + return score; + } + + private int substringScore(int state, FixedBitSet countedSubstrings) { + if (countedSubstrings.getAndSet(state)) return 0; + + int score = state2Score[state]; + assert score > 0; + return score; + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java new file mode 100644 index 00000000000..60e708e06be --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.stream.Collectors; +import org.apache.lucene.util.LuceneTestCase; + +public class TestTrigramAutomaton extends LuceneTestCase { + public void testSameScore() { + checkScores("look", "looked"); + checkScores("look", "cool"); + checkScores("abracadabra", "abraham"); + } + + public void testRandomized() { + String[] alphabet = { + "a", + "b", + "c", + "aa", + "ab", + "abc", + "ccc", + "\uD800\uDFD1", + "\uD800\uDFD2", + "\uD800\uDFD2\uD800\uDFD1", + "\uD800\uDFD2\uD800\uDFD2" + }; + for (int i = 0; i < 100; i++) { + checkScores(randomConcatenation(alphabet), randomConcatenation(alphabet)); + } + } + + private String randomConcatenation(String[] alphabet) { + return random() + .ints(0, alphabet.length) + .limit(random().nextInt(20) + 1) + .mapToObj(i -> alphabet[i]) + .collect(Collectors.joining()); + } + + private void checkScores(String s1, String s2) { + String message = "Fails: checkScores(\"" + s1 + "\", \"" + s2 + "\")"; + assertEquals( + message, + GeneratingSuggester.ngramScore(3, s1, s2, false), + new TrigramAutomaton(s1).ngramScore(s2)); + assertEquals( + message, + GeneratingSuggester.ngramScore(3, s2, s1, false), + new TrigramAutomaton(s2).ngramScore(s1)); + } +}