[hunspell] speed up WordFormGenerator (#11904)

This commit is contained in:
Peter Gromov 2022-11-07 19:41:17 +01:00 committed by GitHub
parent a8120bcb32
commit 682e5c94e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 112 additions and 60 deletions

View File

@ -31,8 +31,30 @@ import org.apache.lucene.util.automaton.RegExp;
*/ */
interface AffixCondition { interface AffixCondition {
String ALWAYS_TRUE_KEY = ".*"; String ALWAYS_TRUE_KEY = ".*";
AffixCondition ALWAYS_TRUE = (word, offset, length) -> true; AffixCondition ALWAYS_TRUE =
AffixCondition ALWAYS_FALSE = (word, offset, length) -> false; new AffixCondition() {
@Override
public boolean acceptsStem(String stem) {
return true;
}
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return true;
}
};
AffixCondition ALWAYS_FALSE =
new AffixCondition() {
@Override
public boolean acceptsStem(String stem) {
return false;
}
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return false;
}
};
default boolean acceptsStem(String stem) { default boolean acceptsStem(String stem) {
return acceptsStem(stem.toCharArray(), 0, stem.length()); return acceptsStem(stem.toCharArray(), 0, stem.length());

View File

@ -60,6 +60,17 @@ class FlagEnumerator {
return new Lookup(result); return new Lookup(result);
} }
/**
 * Tells whether {@code flag} occurs in the slice of {@code array} that begins at index {@code
 * start} and spans {@code length} elements.
 *
 * <p>The scan gives up as soon as it meets an element greater than {@code flag}, so the slice is
 * expected to be in ascending order. {@code Dictionary.FLAG_UNSET} is never reported as present.
 *
 * @param flag the flag to look for
 * @param array the array holding the slice to scan
 * @param start index of the first element of the slice
 * @param length number of elements in the slice
 * @return {@code true} if the slice contains {@code flag}
 */
static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) {
  if (flag == Dictionary.FLAG_UNSET) {
    return false;
  }
  int end = start + length;
  int pos = start;
  // Skip every element smaller than the wanted flag; the first one that is not smaller decides.
  while (pos < end && array[pos] < flag) {
    pos++;
  }
  return pos < end && array[pos] == flag;
}
static class Lookup { static class Lookup {
private final char[] data; private final char[] data;
@ -68,15 +79,7 @@ class FlagEnumerator {
} }
boolean hasFlag(int entryId, char flag) { boolean hasFlag(int entryId, char flag) {
if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false; return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]);
int length = data[entryId];
for (int i = entryId + 1; i < entryId + 1 + length; i++) {
char c = data[i];
if (c == flag) return true;
if (c > flag) return false;
}
return false;
} }
boolean hasAnyFlag(int entryId, char[] sortedFlags) { boolean hasAnyFlag(int entryId, char[] sortedFlags) {

View File

@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND; import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG; import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray;
import java.io.IOException; import java.io.IOException;
import java.io.UncheckedIOException; import java.io.UncheckedIOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -109,14 +111,14 @@ public class WordFormGenerator {
* by throwing an exception * by throwing an exception
*/ */
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) { public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
Set<AffixedWord> result = new LinkedHashSet<>(); List<AffixedWord> result = new ArrayList<>();
DictEntries entries = dictionary.lookupEntries(root); DictEntries entries = dictionary.lookupEntries(root);
if (entries != null) { if (entries != null) {
for (DictEntry entry : entries) { for (DictEntry entry : entries) {
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled)); result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
} }
} }
return new ArrayList<>(result); return result;
} }
/** /**
@ -128,20 +130,40 @@ public class WordFormGenerator {
* by throwing an exception * by throwing an exception
*/ */
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) { public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags)); var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags);
if (!shouldConsiderAtAll(encodedFlags)) return List.of(); if (!shouldConsiderAtAll(encodedFlags)) return List.of();
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>(); encodedFlags = sortAndDeduplicate(encodedFlags);
List<AffixedWord> result = new ArrayList<>();
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of()); AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
checkCanceled.run(); checkCanceled.run();
if (!encodedFlags.contains(dictionary.needaffix)) { if (!FlagEnumerator.hasFlagInSortedArray(
dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) {
result.add(bare); result.add(bare);
} }
result.addAll(expand(bare, encodedFlags, checkCanceled)); result.addAll(expand(bare, encodedFlags, checkCanceled));
return new ArrayList<>(result); return result;
} }
private boolean canStemToOriginal(AffixedWord derived) { private static char[] sortAndDeduplicate(char[] flags) {
Arrays.sort(flags);
for (int i = 1; i < flags.length; i++) {
if (flags[i] == flags[i - 1]) {
return deduplicate(flags);
}
}
return flags;
}
/**
 * Collects the distinct values of {@code flags} into a set and converts that set back to an array
 * via {@code toSortedCharArray}.
 *
 * @param flags the flags, possibly containing repeated values
 * @return the result of {@code toSortedCharArray} applied to the distinct flags
 */
private static char[] deduplicate(char[] flags) {
  Set<Character> unique = new HashSet<>();
  for (int i = 0; i < flags.length; i++) {
    unique.add(flags[i]);
  }
  return toSortedCharArray(unique);
}
protected boolean canStemToOriginal(AffixedWord derived) {
String word = derived.getWord(); String word = derived.getWord();
char[] chars = word.toCharArray(); char[] chars = word.toCharArray();
if (isForbiddenWord(chars, 0, chars.length)) { if (isForbiddenWord(chars, 0, chars.length)) {
@ -190,26 +212,20 @@ public class WordFormGenerator {
return false; return false;
} }
private static LinkedHashSet<Character> toSet(char[] flags) { private List<AffixedWord> expand(AffixedWord stem, char[] flags, Runnable checkCanceled) {
LinkedHashSet<Character> set = new LinkedHashSet<>(); List<AffixedWord> result = new ArrayList<>();
for (char c : flags) { for (char flag : flags) {
set.add(c);
}
return set;
}
private LinkedHashSet<AffixedWord> expand(
AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
for (Character flag : flags) {
List<AffixEntry> entries = affixes.get(flag); List<AffixEntry> entries = affixes.get(flag);
if (entries == null) continue; if (entries == null) continue;
AffixKind kind = entries.get(0).kind;
if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue;
for (AffixEntry affix : entries) { for (AffixEntry affix : entries) {
checkCanceled.run(); checkCanceled.run();
AffixedWord derived = affix.apply(stem, dictionary); AffixedWord derived = affix.apply(stem, dictionary);
if (derived != null) { if (derived != null) {
LinkedHashSet<Character> append = appendFlags(affix); char[] append = appendFlags(affix);
if (shouldConsiderAtAll(append)) { if (shouldConsiderAtAll(append)) {
if (canStemToOriginal(derived)) { if (canStemToOriginal(derived)) {
result.add(derived); result.add(derived);
@ -224,25 +240,37 @@ public class WordFormGenerator {
return result; return result;
} }
private boolean shouldConsiderAtAll(Set<Character> flags) { private boolean shouldConsiderAtAll(char[] flags) {
return !flags.contains(dictionary.compoundBegin) for (char flag : flags) {
&& !flags.contains(dictionary.compoundMiddle) if (flag == dictionary.compoundBegin
&& !flags.contains(dictionary.compoundEnd) || flag == dictionary.compoundMiddle
&& !flags.contains(dictionary.forbiddenword) || flag == dictionary.compoundEnd
&& !flags.contains(dictionary.onlyincompound); || flag == dictionary.forbiddenword
|| flag == dictionary.onlyincompound) {
return false;
}
}
return true;
} }
private LinkedHashSet<Character> updateFlags( private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) {
Set<Character> flags, Character toRemove, Set<Character> toAppend) { char[] result = new char[flags.length + toAppend.length - 1];
LinkedHashSet<Character> copy = new LinkedHashSet<>(flags); int index = 0;
copy.remove(toRemove); for (char flag : flags) {
copy.addAll(toAppend); if (flag != toRemove && flag != dictionary.needaffix) {
return copy; result[index++] = flag;
}
}
for (char flag : toAppend) {
result[index++] = flag;
}
return sortAndDeduplicate(result);
} }
private LinkedHashSet<Character> appendFlags(AffixEntry affix) { private char[] appendFlags(AffixEntry affix) {
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND); char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId)); return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId);
} }
/** /**
@ -272,9 +300,8 @@ public class WordFormGenerator {
private record AffixEntry( private record AffixEntry(
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) { int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
String word = stem.getWord(); String word = stem.getWord();
boolean isPrefix = kind == AffixKind.PREFIX; boolean isPrefix = kind == AffixKind.PREFIX;
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null; if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
@ -286,24 +313,24 @@ public class WordFormGenerator {
if (!condition.acceptsStem(stripped)) return null; if (!condition.acceptsStem(stripped)) return null;
String applied = isPrefix ? affix + stripped : stripped + affix; String applied = isPrefix ? affix + stripped : stripped + affix;
List<Affix> prefixes = new ArrayList<>(stem.getPrefixes()); List<Affix> prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes();
List<Affix> suffixes = new ArrayList<>(stem.getSuffixes()); List<Affix> suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes());
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id)); (isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes); return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
} }
}
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) { private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) {
boolean isPrefix = kind == AffixKind.PREFIX; boolean isPrefix = kind == AffixKind.PREFIX;
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes(); List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
if (sameAffixes.size() == 2) return false; int size = sameAffixes.size();
if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false; if (size == 2) return false;
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false; if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false;
if (sameAffixes.size() == 1 if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
&& !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) { if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
return false; return false;
}
return true;
} }
return true;
} }
private class WordCompressor { private class WordCompressor {