From 92a20c24e93915a79ef53391a70cf2fb35cb1e93 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 15 Mar 2022 18:43:56 +0100 Subject: [PATCH] LUCENE-10451 Hunspell: don't perform potentially expensive spellchecking after timeout (#721) move all expensive operations closer to the suggestion creation, encapsulate case and output conversion into a new Suggestion class --- .../hunspell/GeneratingSuggester.java | 6 +- .../lucene/analysis/hunspell/Hunspell.java | 80 ++++++------------ .../analysis/hunspell/ModifyingSuggester.java | 83 +++++++++++-------- .../lucene/analysis/hunspell/Suggestion.java | 77 +++++++++++++++++ 4 files changed, 157 insertions(+), 89 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index cc72027a331..f7ffdc3b6dd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -52,7 +52,7 @@ class GeneratingSuggester { this.speller = speller; } - List suggest(String word, WordCase originalCase, Set prevSuggestions) { + List suggest(String word, WordCase originalCase, Set prevSuggestions) { List>> roots = findSimilarDictionaryEntries(word, originalCase); List> expanded = expandRoots(word, roots); TreeSet> bySimilarity = rankBySimilarity(word, expanded); @@ -331,7 +331,7 @@ class GeneratingSuggester { } private List getMostRelevantSuggestions( - TreeSet> bySimilarity, Set prevSuggestions) { + TreeSet> bySimilarity, Set prevSuggestions) { List result = new ArrayList<>(); boolean hasExcellent = false; for (Weighted weighted : bySimilarity) { @@ -347,7 +347,7 @@ class GeneratingSuggester { break; } - if (prevSuggestions.stream().noneMatch(weighted.word::contains) + if (prevSuggestions.stream().noneMatch(s -> weighted.word.contains(s.raw)) && result.stream().noneMatch(weighted.word::contains) && speller.checkWord(weighted.word)) { result.add(weighted.word); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index 067564998e2..27eebc6e3ec 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -17,7 +17,8 @@ package org.apache.lucene.analysis.hunspell; import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; -import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*; +import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT; +import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; @@ -25,11 +26,11 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END; import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.LinkedHashSet; import java.util.List; -import java.util.Locale; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -543,25 +544,25 @@ public class Hunspell { } } - LinkedHashSet suggestions = new LinkedHashSet<>(); + LinkedHashSet suggestions = new LinkedHashSet<>(); Runnable checkCanceled = - policy == NO_TIMEOUT - ? this.checkCanceled - : checkTimeLimit(word, wordCase, suggestions, timeLimitMs); + policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs); try { doSuggest(word, wordCase, suggestions, checkCanceled); } catch (SuggestionTimeoutException e) { - if (policy == RETURN_PARTIAL_RESULT) { - return postprocess(word, wordCase, suggestions); + if (policy != RETURN_PARTIAL_RESULT) { + throw e; } - throw e; } - return postprocess(word, wordCase, suggestions); + return postprocess(suggestions); } private void doSuggest( - String word, WordCase wordCase, LinkedHashSet suggestions, Runnable checkCanceled) { + String word, + WordCase wordCase, + LinkedHashSet suggestions, + Runnable checkCanceled) { Hunspell suggestionSpeller = new Hunspell(dictionary, policy, checkCanceled) { @Override @@ -570,22 +571,26 @@ public class Hunspell { && !dictionary.hasFlag(formID, dictionary.subStandard); } }; - ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions); - boolean hasGoodSuggestions = modifier.suggest(word, wordCase); + boolean hasGoodSuggestions = + new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest(); if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) { - suggestions.addAll( + List generated = new GeneratingSuggester(suggestionSpeller) - .suggest(dictionary.toLowerCase(word), wordCase, suggestions)); + .suggest(dictionary.toLowerCase(word), wordCase, suggestions); + for (String raw : generated) { + suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller)); + } } - if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) { - suggestions.addAll(modifyChunksBetweenDashes(word)); + if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) { + for (String raw : modifyChunksBetweenDashes(word)) { + suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller)); + } } } - private Runnable checkTimeLimit( - String word, WordCase wordCase, Set suggestions, long timeLimitMs) { + private Runnable checkTimeLimit(String word, Set suggestions, long timeLimitMs) { return new Runnable() { final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs); int invocationCounter = 100; @@ -603,38 +608,15 @@ public class Hunspell { private void stop() { List partialResult = - policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions); + policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions); String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word; throw new SuggestionTimeoutException(message, partialResult); } }; } - private List postprocess(String word, WordCase wordCase, Collection suggestions) { - Set result = new LinkedHashSet<>(); - for (String candidate : suggestions) { - result.add(adjustSuggestionCase(candidate, wordCase, word)); - if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) { - result.add(candidate); - } - } - return result.stream().map(this::cleanOutput).collect(Collectors.toList()); - } - - private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) { - if (originalCase == WordCase.UPPER) { - String upper = candidate.toUpperCase(Locale.ROOT); - if (upper.contains(" ") || spell(upper)) { - return upper; - } - } - if (Character.isUpperCase(original.charAt(0))) { - String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1); - if (title.contains(" ") || spell(title)) { - return title; - } - } - return candidate; + private List postprocess(Collection suggestions) { + return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList(); } private List modifyChunksBetweenDashes(String word) { @@ -662,12 +644,4 @@ public class Hunspell { } return result; } - - private String cleanOutput(String s) { - if (dictionary.oconv == null) return s; - - StringBuilder sb = new StringBuilder(s); - dictionary.oconv.applyMappings(sb); - return sb.toString(); - } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java index 8f90a416bb4..ab26bf9c3b4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java @@ -25,42 +25,50 @@ import java.util.Locale; /** A class that modifies the given misspelled word in various ways to get correct suggestions */ class ModifyingSuggester { private static final int MAX_CHAR_DISTANCE = 4; - private final LinkedHashSet result; + private final LinkedHashSet result; + private final String misspelled; + private final WordCase wordCase; private final char[] tryChars; private final Hunspell speller; - ModifyingSuggester(Hunspell speller, LinkedHashSet result) { + ModifyingSuggester( + Hunspell speller, LinkedHashSet result, String misspelled, WordCase wordCase) { this.speller = speller; tryChars = speller.dictionary.tryChars.toCharArray(); this.result = result; + this.misspelled = misspelled; + this.wordCase = wordCase; } /** @return whether any of the added suggestions are considered "good" */ - boolean suggest(String word, WordCase wordCase) { - String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word; + boolean suggest() { + String low = + wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(misspelled) : misspelled; if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) { trySuggestion(low); } - boolean hasGoodSuggestions = tryVariationsOf(word); + boolean hasGoodSuggestions = tryVariationsOf(misspelled); if (wordCase == WordCase.TITLE) { hasGoodSuggestions |= tryVariationsOf(low); } else if (wordCase == WordCase.UPPER) { hasGoodSuggestions |= tryVariationsOf(low); - hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(word)); + hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(misspelled)); } else if (wordCase == WordCase.MIXED) { - int dot = word.indexOf('.'); - if (dot > 0 - && dot < word.length() - 1 - && WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) { - result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1)); + int dot = misspelled.indexOf('.'); + if (dot > 0 && dot < misspelled.length() - 1) { + String afterDot = misspelled.substring(dot + 1); + if (WordCase.caseOf(afterDot) == WordCase.TITLE) { + result.add(createSuggestion(misspelled.substring(0, dot + 1) + " " + afterDot)); + } } - boolean capitalized = Character.isUpperCase(word.charAt(0)); + char first = misspelled.charAt(0); + boolean capitalized = Character.isUpperCase(first); if (capitalized) { hasGoodSuggestions |= - tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1)); + tryVariationsOf(speller.dictionary.caseFold(first) + misspelled.substring(1)); } hasGoodSuggestions |= tryVariationsOf(low); @@ -69,29 +77,38 @@ class ModifyingSuggester { hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(low)); } - List adjusted = new ArrayList<>(); - for (String candidate : result) { - String s = capitalizeAfterSpace(word, candidate); - adjusted.add(s.equals(candidate) ? adjusted.size() : 0, s); + List reordered = new ArrayList<>(); + for (Suggestion candidate : result) { + Suggestion changed = capitalizeAfterSpace(candidate.raw); + if (changed == null) { + reordered.add(candidate); + } else { + reordered.add(0, changed); + } } result.clear(); - result.addAll(adjusted); + result.addAll(reordered); } return hasGoodSuggestions; } + private Suggestion createSuggestion(String candidate) { + return new Suggestion(candidate, misspelled, wordCase, speller); + } + // aNew -> "a New" (instead of "a new") - private String capitalizeAfterSpace(String misspelled, String candidate) { + private Suggestion capitalizeAfterSpace(String candidate) { int space = candidate.indexOf(' '); int tail = candidate.length() - space - 1; if (space > 0 && !misspelled.regionMatches(misspelled.length() - tail, candidate, space + 1, tail)) { - return candidate.substring(0, space + 1) - + Character.toUpperCase(candidate.charAt(space + 1)) - + candidate.substring(space + 2); + return createSuggestion( + candidate.substring(0, space + 1) + + Character.toUpperCase(candidate.charAt(space + 1)) + + candidate.substring(space + 2)); } - return candidate; + return null; } private boolean tryVariationsOf(String word) { @@ -111,9 +128,9 @@ class ModifyingSuggester { tryReplacingChar(word); tryTwoDuplicateChars(word); - List goodSplit = checkDictionaryForSplitSuggestions(word); + List goodSplit = checkDictionaryForSplitSuggestions(word); if (!goodSplit.isEmpty()) { - List copy = new ArrayList<>(result); + List copy = new ArrayList<>(result); result.clear(); result.addAll(goodSplit); if (hasGoodSuggestions) { @@ -139,7 +156,7 @@ class ModifyingSuggester { if (candidate.contains(" ") && Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) { - result.add(candidate); + result.add(createSuggestion(candidate)); } } } @@ -294,19 +311,19 @@ class ModifyingSuggester { } } - private List checkDictionaryForSplitSuggestions(String word) { - List result = new ArrayList<>(); + private List checkDictionaryForSplitSuggestions(String word) { + List result = new ArrayList<>(); for (int i = 1; i < word.length() - 1; i++) { String w1 = word.substring(0, i); String w2 = word.substring(i); String spaced = w1 + " " + w2; if (speller.checkWord(spaced)) { - result.add(spaced); + result.add(createSuggestion(spaced)); } if (shouldSplitByDash()) { String dashed = w1 + "-" + w2; if (speller.checkWord(dashed)) { - result.add(dashed); + result.add(createSuggestion(dashed)); } } } @@ -318,9 +335,9 @@ class ModifyingSuggester { String w1 = word.substring(0, i); String w2 = word.substring(i); if (checkSimpleWord(w1) && checkSimpleWord(w2)) { - result.add(w1 + " " + w2); + result.add(createSuggestion(w1 + " " + w2)); if (w1.length() > 1 && w2.length() > 1 && shouldSplitByDash()) { - result.add(w1 + "-" + w2); + result.add(createSuggestion(w1 + "-" + w2)); } } } @@ -331,6 +348,6 @@ class ModifyingSuggester { } private boolean trySuggestion(String candidate) { - return speller.checkWord(candidate) && result.add(candidate); + return speller.checkWord(candidate) && result.add(createSuggestion(candidate)); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java new file mode 100644 index 00000000000..9c3f4278dee --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Objects; + +class Suggestion { + final String raw; + final String[] result; + + Suggestion(String raw, String misspelled, WordCase originalCase, Hunspell speller) { + this.raw = raw; + + List result = new ArrayList<>(); + String adjusted = adjustSuggestionCase(raw, misspelled, originalCase); + result.add( + cleanOutput(speller, adjusted.contains(" ") || speller.spell(adjusted) ? adjusted : raw)); + if (originalCase == WordCase.UPPER && speller.dictionary.checkSharpS && raw.contains("ß")) { + result.add(cleanOutput(speller, raw)); + } + this.result = result.toArray(new String[0]); + } + + private String adjustSuggestionCase(String candidate, String misspelled, WordCase originalCase) { + if (originalCase == WordCase.UPPER) { + return candidate.toUpperCase(Locale.ROOT); + } + if (Character.isUpperCase(misspelled.charAt(0))) { + return Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1); + } + return candidate; + } + + private String cleanOutput(Hunspell speller, String s) { + if (speller.dictionary.oconv == null) return s; + + StringBuilder sb = new StringBuilder(s); + speller.dictionary.oconv.applyMappings(sb); + return sb.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Suggestion)) return false; + Suggestion that = (Suggestion) o; + return raw.equals(that.raw) && Arrays.equals(result, that.result); + } + + @Override + public int hashCode() { + return 31 * Objects.hash(raw) + Arrays.hashCode(result); + } + + @Override + public String toString() { + return raw; + } +}