[hunspell] speed up WordFormGenerator (#11904)

This commit is contained in:
Peter Gromov 2022-11-07 19:41:17 +01:00 committed by GitHub
parent a8120bcb32
commit 682e5c94e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 112 additions and 60 deletions

View File

@ -31,8 +31,30 @@ import org.apache.lucene.util.automaton.RegExp;
*/
interface AffixCondition {
String ALWAYS_TRUE_KEY = ".*";
AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
AffixCondition ALWAYS_TRUE =
new AffixCondition() {
@Override
public boolean acceptsStem(String stem) {
return true;
}
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return true;
}
};
AffixCondition ALWAYS_FALSE =
new AffixCondition() {
@Override
public boolean acceptsStem(String stem) {
return false;
}
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return false;
}
};
default boolean acceptsStem(String stem) {
return acceptsStem(stem.toCharArray(), 0, stem.length());

View File

@ -60,6 +60,17 @@ class FlagEnumerator {
return new Lookup(result);
}
static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) {
if (flag == Dictionary.FLAG_UNSET) return false;
for (int i = start; i < start + length; i++) {
char c = array[i];
if (c == flag) return true;
if (c > flag) return false;
}
return false;
}
static class Lookup {
private final char[] data;
@ -68,15 +79,7 @@ class FlagEnumerator {
}
boolean hasFlag(int entryId, char flag) {
if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false;
int length = data[entryId];
for (int i = entryId + 1; i < entryId + 1 + length; i++) {
char c = data[i];
if (c == flag) return true;
if (c > flag) return false;
}
return false;
return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]);
}
boolean hasAnyFlag(int entryId, char[] sortedFlags) {

View File

@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@ -109,14 +111,14 @@ public class WordFormGenerator {
* by throwing an exception
*/
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
Set<AffixedWord> result = new LinkedHashSet<>();
List<AffixedWord> result = new ArrayList<>();
DictEntries entries = dictionary.lookupEntries(root);
if (entries != null) {
for (DictEntry entry : entries) {
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
}
}
return new ArrayList<>(result);
return result;
}
/**
@ -128,20 +130,40 @@ public class WordFormGenerator {
* by throwing an exception
*/
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags);
if (!shouldConsiderAtAll(encodedFlags)) return List.of();
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
encodedFlags = sortAndDeduplicate(encodedFlags);
List<AffixedWord> result = new ArrayList<>();
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
checkCanceled.run();
if (!encodedFlags.contains(dictionary.needaffix)) {
if (!FlagEnumerator.hasFlagInSortedArray(
dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) {
result.add(bare);
}
result.addAll(expand(bare, encodedFlags, checkCanceled));
return new ArrayList<>(result);
return result;
}
private boolean canStemToOriginal(AffixedWord derived) {
private static char[] sortAndDeduplicate(char[] flags) {
Arrays.sort(flags);
for (int i = 1; i < flags.length; i++) {
if (flags[i] == flags[i - 1]) {
return deduplicate(flags);
}
}
return flags;
}
private static char[] deduplicate(char[] flags) {
Set<Character> set = new HashSet<>();
for (char flag : flags) {
set.add(flag);
}
return toSortedCharArray(set);
}
protected boolean canStemToOriginal(AffixedWord derived) {
String word = derived.getWord();
char[] chars = word.toCharArray();
if (isForbiddenWord(chars, 0, chars.length)) {
@ -190,26 +212,20 @@ public class WordFormGenerator {
return false;
}
private static LinkedHashSet<Character> toSet(char[] flags) {
LinkedHashSet<Character> set = new LinkedHashSet<>();
for (char c : flags) {
set.add(c);
}
return set;
}
private LinkedHashSet<AffixedWord> expand(
AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
for (Character flag : flags) {
private List<AffixedWord> expand(AffixedWord stem, char[] flags, Runnable checkCanceled) {
List<AffixedWord> result = new ArrayList<>();
for (char flag : flags) {
List<AffixEntry> entries = affixes.get(flag);
if (entries == null) continue;
AffixKind kind = entries.get(0).kind;
if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue;
for (AffixEntry affix : entries) {
checkCanceled.run();
AffixedWord derived = affix.apply(stem, dictionary);
if (derived != null) {
LinkedHashSet<Character> append = appendFlags(affix);
char[] append = appendFlags(affix);
if (shouldConsiderAtAll(append)) {
if (canStemToOriginal(derived)) {
result.add(derived);
@ -224,25 +240,37 @@ public class WordFormGenerator {
return result;
}
private boolean shouldConsiderAtAll(Set<Character> flags) {
return !flags.contains(dictionary.compoundBegin)
&& !flags.contains(dictionary.compoundMiddle)
&& !flags.contains(dictionary.compoundEnd)
&& !flags.contains(dictionary.forbiddenword)
&& !flags.contains(dictionary.onlyincompound);
private boolean shouldConsiderAtAll(char[] flags) {
for (char flag : flags) {
if (flag == dictionary.compoundBegin
|| flag == dictionary.compoundMiddle
|| flag == dictionary.compoundEnd
|| flag == dictionary.forbiddenword
|| flag == dictionary.onlyincompound) {
return false;
}
}
return true;
}
private LinkedHashSet<Character> updateFlags(
Set<Character> flags, Character toRemove, Set<Character> toAppend) {
LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
copy.remove(toRemove);
copy.addAll(toAppend);
return copy;
private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) {
char[] result = new char[flags.length + toAppend.length - 1];
int index = 0;
for (char flag : flags) {
if (flag != toRemove && flag != dictionary.needaffix) {
result[index++] = flag;
}
}
for (char flag : toAppend) {
result[index++] = flag;
}
return sortAndDeduplicate(result);
}
private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
private char[] appendFlags(AffixEntry affix) {
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId);
}
/**
@ -272,9 +300,8 @@ public class WordFormGenerator {
private record AffixEntry(
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
String word = stem.getWord();
boolean isPrefix = kind == AffixKind.PREFIX;
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
@ -286,24 +313,24 @@ public class WordFormGenerator {
if (!condition.acceptsStem(stripped)) return null;
String applied = isPrefix ? affix + stripped : stripped + affix;
List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
List<Affix> prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes();
List<Affix> suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes());
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
}
}
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
boolean isPrefix = kind == AffixKind.PREFIX;
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
if (sameAffixes.size() == 2) return false;
if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
if (sameAffixes.size() == 1
&& !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
return false;
}
return true;
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) {
boolean isPrefix = kind == AffixKind.PREFIX;
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
int size = sameAffixes.size();
if (size == 2) return false;
if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false;
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
return false;
}
return true;
}
private class WordCompressor {