From 682e5c94e83320fee929a16f375fd0ebe4ddc99c Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 7 Nov 2022 19:41:17 +0100 Subject: [PATCH] [hunspell] speed up WordFormGenerator (#11904) --- .../analysis/hunspell/AffixCondition.java | 26 +++- .../analysis/hunspell/FlagEnumerator.java | 21 +-- .../analysis/hunspell/WordFormGenerator.java | 125 +++++++++++------- 3 files changed, 112 insertions(+), 60 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java index 9686f68e102..1d1c51da00d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java @@ -31,8 +31,30 @@ import org.apache.lucene.util.automaton.RegExp; */ interface AffixCondition { String ALWAYS_TRUE_KEY = ".*"; - AffixCondition ALWAYS_TRUE = (word, offset, length) -> true; - AffixCondition ALWAYS_FALSE = (word, offset, length) -> false; + AffixCondition ALWAYS_TRUE = + new AffixCondition() { + @Override + public boolean acceptsStem(String stem) { + return true; + } + + @Override + public boolean acceptsStem(char[] word, int offset, int length) { + return true; + } + }; + AffixCondition ALWAYS_FALSE = + new AffixCondition() { + @Override + public boolean acceptsStem(String stem) { + return false; + } + + @Override + public boolean acceptsStem(char[] word, int offset, int length) { + return false; + } + }; default boolean acceptsStem(String stem) { return acceptsStem(stem.toCharArray(), 0, stem.length()); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java index bb31b5ba3da..a7cd45c4a14 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java @@ -60,6 +60,17 @@ class FlagEnumerator { return new Lookup(result); } + static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) { + if (flag == Dictionary.FLAG_UNSET) return false; + + for (int i = start; i < start + length; i++) { + char c = array[i]; + if (c == flag) return true; + if (c > flag) return false; + } + return false; + } + static class Lookup { private final char[] data; @@ -68,15 +79,7 @@ class FlagEnumerator { } boolean hasFlag(int entryId, char flag) { - if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false; - - int length = data[entryId]; - for (int i = entryId + 1; i < entryId + 1 + length; i++) { - char c = data[i]; - if (c == flag) return true; - if (c > flag) return false; - } - return false; + return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]); } boolean hasAnyFlag(int entryId, char[] sortedFlags) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java index 043df4fbb1d..bc0531edb89 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java @@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell; import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND; import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG; import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; +import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -109,14 +111,14 @@ public class WordFormGenerator { * by throwing an exception */ public List getAllWordForms(String root, Runnable checkCanceled) { - Set result = new LinkedHashSet<>(); + List result = new ArrayList<>(); DictEntries entries = dictionary.lookupEntries(root); if (entries != null) { for (DictEntry entry : entries) { result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled)); } } - return new ArrayList<>(result); + return result; } /** @@ -128,20 +130,40 @@ public class WordFormGenerator { * by throwing an exception */ public List getAllWordForms(String stem, String flags, Runnable checkCanceled) { - var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags)); + var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags); if (!shouldConsiderAtAll(encodedFlags)) return List.of(); - LinkedHashSet result = new LinkedHashSet<>(); + encodedFlags = sortAndDeduplicate(encodedFlags); + List result = new ArrayList<>(); AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of()); checkCanceled.run(); - if (!encodedFlags.contains(dictionary.needaffix)) { + if (!FlagEnumerator.hasFlagInSortedArray( + dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) { result.add(bare); } result.addAll(expand(bare, encodedFlags, checkCanceled)); - return new ArrayList<>(result); + return result; } - private boolean canStemToOriginal(AffixedWord derived) { + private static char[] sortAndDeduplicate(char[] flags) { + Arrays.sort(flags); + for (int i = 1; i < flags.length; i++) { + if (flags[i] == flags[i - 1]) { + return deduplicate(flags); + } + } + return flags; + } + + private static char[] deduplicate(char[] flags) { + Set set = new HashSet<>(); + for (char flag : flags) { + set.add(flag); + } + return toSortedCharArray(set); + } + + protected boolean canStemToOriginal(AffixedWord derived) { String word = derived.getWord(); char[] chars = word.toCharArray(); if (isForbiddenWord(chars, 0, chars.length)) { @@ -190,26 +212,20 @@ public class WordFormGenerator { return false; } - private static LinkedHashSet toSet(char[] flags) { - LinkedHashSet set = new LinkedHashSet<>(); - for (char c : flags) { - set.add(c); - } - return set; - } - - private LinkedHashSet expand( - AffixedWord stem, LinkedHashSet flags, Runnable checkCanceled) { - LinkedHashSet result = new LinkedHashSet<>(); - for (Character flag : flags) { + private List expand(AffixedWord stem, char[] flags, Runnable checkCanceled) { + List result = new ArrayList<>(); + for (char flag : flags) { List entries = affixes.get(flag); if (entries == null) continue; + AffixKind kind = entries.get(0).kind; + if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue; + for (AffixEntry affix : entries) { checkCanceled.run(); AffixedWord derived = affix.apply(stem, dictionary); if (derived != null) { - LinkedHashSet append = appendFlags(affix); + char[] append = appendFlags(affix); if (shouldConsiderAtAll(append)) { if (canStemToOriginal(derived)) { result.add(derived); @@ -224,25 +240,37 @@ public class WordFormGenerator { return result; } - private boolean shouldConsiderAtAll(Set flags) { - return !flags.contains(dictionary.compoundBegin) - && !flags.contains(dictionary.compoundMiddle) - && !flags.contains(dictionary.compoundEnd) - && !flags.contains(dictionary.forbiddenword) - && !flags.contains(dictionary.onlyincompound); + private boolean shouldConsiderAtAll(char[] flags) { + for (char flag : flags) { + if (flag == dictionary.compoundBegin + || flag == dictionary.compoundMiddle + || flag == dictionary.compoundEnd + || flag == dictionary.forbiddenword + || flag == dictionary.onlyincompound) { + return false; + } + } + + return true; } - private LinkedHashSet updateFlags( - Set flags, Character toRemove, Set toAppend) { - LinkedHashSet copy = new LinkedHashSet<>(flags); - copy.remove(toRemove); - copy.addAll(toAppend); - return copy; + private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) { + char[] result = new char[flags.length + toAppend.length - 1]; + int index = 0; + for (char flag : flags) { + if (flag != toRemove && flag != dictionary.needaffix) { + result[index++] = flag; + } + } + for (char flag : toAppend) { + result[index++] = flag; + } + return sortAndDeduplicate(result); } - private LinkedHashSet appendFlags(AffixEntry affix) { + private char[] appendFlags(AffixEntry affix) { char appendId = dictionary.affixData(affix.id, AFFIX_APPEND); - return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId)); + return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId); } /** @@ -272,9 +300,8 @@ public class WordFormGenerator { private record AffixEntry( int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) { - AffixedWord apply(AffixedWord stem, Dictionary dictionary) { - if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null; + AffixedWord apply(AffixedWord stem, Dictionary dictionary) { String word = stem.getWord(); boolean isPrefix = kind == AffixKind.PREFIX; if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null; @@ -286,24 +313,24 @@ public class WordFormGenerator { if (!condition.acceptsStem(stripped)) return null; String applied = isPrefix ? affix + stripped : stripped + affix; - List prefixes = new ArrayList<>(stem.getPrefixes()); - List suffixes = new ArrayList<>(stem.getSuffixes()); + List prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes(); + List suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes()); (isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id)); return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes); } + } - private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) { - boolean isPrefix = kind == AffixKind.PREFIX; - List sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes(); - if (sameAffixes.size() == 2) return false; - if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false; - if (!isPrefix && !stem.getPrefixes().isEmpty()) return false; - if (sameAffixes.size() == 1 - && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) { - return false; - } - return true; + private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) { + boolean isPrefix = kind == AffixKind.PREFIX; + List sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes(); + int size = sameAffixes.size(); + if (size == 2) return false; + if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false; + if (!isPrefix && !stem.getPrefixes().isEmpty()) return false; + if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) { + return false; } + return true; } private class WordCompressor {