From f783848e710f56f410906c235138842b5ac04b69 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 22 Feb 2021 11:35:36 +0100 Subject: [PATCH] LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation (#2413) * LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation * check more words in TestPerformance --- .../lucene/analysis/hunspell/Dictionary.java | 46 +++++++++++++------ .../lucene/analysis/hunspell/Stemmer.java | 6 +-- .../analysis/hunspell/TestPerformance.java | 4 +- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 5a23d76b218..a3cbe39a402 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -37,10 +37,10 @@ import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -78,9 +78,6 @@ public class Dictionary { private static final int DEFAULT_FLAGS = 65510; static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell - // TODO: really for suffixes we should reverse the automaton and run them backwards - private static final String PREFIX_CONDITION_REGEX = "%s.*"; - private static final String SUFFIX_CONDITION_REGEX = ".*%s"; static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET); @@ -147,8 +144,12 @@ public class Dictionary { boolean ignoreCase; boolean checkSharpS; boolean complexPrefixes; - // if no affixes have continuation classes, no need to do 2-level affix stripping - boolean twoStageAffix; + + /** + * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no + * need to do 2-level affix stripping with it. + */ + private char[] secondStageAffixFlags; char circumfix; char keepcase, forceUCase; @@ -332,6 +333,7 @@ public class Dictionary { throws IOException, ParseException { TreeMap> prefixes = new TreeMap<>(); TreeMap> suffixes = new TreeMap<>(); + Set stage2Flags = new HashSet<>(); Map seenPatterns = new HashMap<>(); // zero condition -> 0 ord @@ -359,9 +361,9 @@ public class Dictionary { } else if ("AM".equals(firstWord)) { parseMorphAlias(line); } else if ("PFX".equals(firstWord)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags); + parseAffix(prefixes, stage2Flags, line, reader, false, seenPatterns, seenStrips, flags); } else if ("SFX".equals(firstWord)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags); + parseAffix(suffixes, stage2Flags, line, reader, true, seenPatterns, seenStrips, flags); } else if (line.equals("COMPLEXPREFIXES")) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix @@ -476,6 +478,7 @@ public class Dictionary { this.prefixes = affixFST(prefixes); this.suffixes = affixFST(suffixes); + secondStageAffixFlags = toSortedCharArray(stage2Flags); int totalChars = 0; for (String strip : seenStrips.keySet()) { @@ -675,16 +678,15 @@ public class Dictionary { * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from - * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate - * the condition regex pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix( TreeMap> affixes, + Set secondStageFlags, String header, LineNumberReader reader, - String conditionPattern, + boolean isSuffix, Map seenPatterns, Map seenStrips, FlagEnumerator flags) @@ -694,7 +696,6 @@ public class Dictionary { String[] args = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); - boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX); int numLines; try { @@ -725,7 +726,9 @@ public class Dictionary { } appendFlags = flagParsingStrategy.parseFlags(flagPart); - twoStageAffix = true; + for (char appendFlag : appendFlags) { + secondStageFlags.add(appendFlag); + } } // zero affix -> empty string if ("0".equals(affixArg)) { @@ -750,7 +753,8 @@ public class Dictionary { // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... } else { - regex = String.format(Locale.ROOT, conditionPattern, condition); + // TODO: really for suffixes we should reverse the automaton and run them backwards + regex = isSuffix ? ".*" + condition : condition + ".*"; } // deduplicate patterns @@ -1610,6 +1614,20 @@ public class Dictionary { return reuse; } + private static char[] toSortedCharArray(Set set) { + char[] chars = new char[set.size()]; + int i = 0; + for (Character c : set) { + chars[i++] = c; + } + Arrays.sort(chars); + return chars; + } + + boolean isSecondStageAffix(char flag) { + return Arrays.binarySearch(secondStageAffixFlags, flag) >= 0; + } + /** folds single character (according to LANG if present) */ char caseFold(char c) { if (alternateCasing) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 41648925951..2a75f45d6e1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -648,11 +648,11 @@ final class Stemmer { if (recursionDepth == 0) { if (prefix) { prefixId = affix; - doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix; + doPrefix = dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag); // we took away the first prefix. // COMPLEXPREFIXES = true: combine with a second prefix and another suffix // COMPLEXPREFIXES = false: combine with a suffix - } else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) { + } else if (!dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag)) { doPrefix = false; // we took away a suffix. // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed @@ -665,7 +665,7 @@ final class Stemmer { if (prefix && dictionary.complexPrefixes) { prefixId = affix; // we took away the second prefix: go look for another suffix - } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) { + } else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageAffix(flag)) { return true; } // we took away a prefix, then a suffix: go look for another suffix diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index 4cd54ba8d32..bc9f71cd66a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -66,7 +66,7 @@ public class TestPerformance extends LuceneTestCase { @Test public void de() throws Exception { - checkAnalysisPerformance("de", 200_000); + checkAnalysisPerformance("de", 300_000); } @Test @@ -76,7 +76,7 @@ public class TestPerformance extends LuceneTestCase { @Test public void fr() throws Exception { - checkAnalysisPerformance("fr", 40_000); + checkAnalysisPerformance("fr", 80_000); } @Test