mirror of https://github.com/apache/lucene.git
[hunspell] speed up WordFormGenerator (#11904)
This commit is contained in:
parent
a8120bcb32
commit
682e5c94e8
|
@ -31,8 +31,30 @@ import org.apache.lucene.util.automaton.RegExp;
|
||||||
*/
|
*/
|
||||||
interface AffixCondition {
|
interface AffixCondition {
|
||||||
String ALWAYS_TRUE_KEY = ".*";
|
String ALWAYS_TRUE_KEY = ".*";
|
||||||
AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
|
AffixCondition ALWAYS_TRUE =
|
||||||
AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
|
new AffixCondition() {
|
||||||
|
@Override
|
||||||
|
public boolean acceptsStem(String stem) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean acceptsStem(char[] word, int offset, int length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
AffixCondition ALWAYS_FALSE =
|
||||||
|
new AffixCondition() {
|
||||||
|
@Override
|
||||||
|
public boolean acceptsStem(String stem) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean acceptsStem(char[] word, int offset, int length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
default boolean acceptsStem(String stem) {
|
default boolean acceptsStem(String stem) {
|
||||||
return acceptsStem(stem.toCharArray(), 0, stem.length());
|
return acceptsStem(stem.toCharArray(), 0, stem.length());
|
||||||
|
|
|
@ -60,6 +60,17 @@ class FlagEnumerator {
|
||||||
return new Lookup(result);
|
return new Lookup(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) {
|
||||||
|
if (flag == Dictionary.FLAG_UNSET) return false;
|
||||||
|
|
||||||
|
for (int i = start; i < start + length; i++) {
|
||||||
|
char c = array[i];
|
||||||
|
if (c == flag) return true;
|
||||||
|
if (c > flag) return false;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static class Lookup {
|
static class Lookup {
|
||||||
private final char[] data;
|
private final char[] data;
|
||||||
|
|
||||||
|
@ -68,15 +79,7 @@ class FlagEnumerator {
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean hasFlag(int entryId, char flag) {
|
boolean hasFlag(int entryId, char flag) {
|
||||||
if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false;
|
return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]);
|
||||||
|
|
||||||
int length = data[entryId];
|
|
||||||
for (int i = entryId + 1; i < entryId + 1 + length; i++) {
|
|
||||||
char c = data[i];
|
|
||||||
if (c == flag) return true;
|
|
||||||
if (c > flag) return false;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean hasAnyFlag(int entryId, char[] sortedFlags) {
|
boolean hasAnyFlag(int entryId, char[] sortedFlags) {
|
||||||
|
|
|
@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UncheckedIOException;
|
import java.io.UncheckedIOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -109,14 +111,14 @@ public class WordFormGenerator {
|
||||||
* by throwing an exception
|
* by throwing an exception
|
||||||
*/
|
*/
|
||||||
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
|
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
|
||||||
Set<AffixedWord> result = new LinkedHashSet<>();
|
List<AffixedWord> result = new ArrayList<>();
|
||||||
DictEntries entries = dictionary.lookupEntries(root);
|
DictEntries entries = dictionary.lookupEntries(root);
|
||||||
if (entries != null) {
|
if (entries != null) {
|
||||||
for (DictEntry entry : entries) {
|
for (DictEntry entry : entries) {
|
||||||
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
|
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new ArrayList<>(result);
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -128,20 +130,40 @@ public class WordFormGenerator {
|
||||||
* by throwing an exception
|
* by throwing an exception
|
||||||
*/
|
*/
|
||||||
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
|
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
|
||||||
var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
|
var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags);
|
||||||
if (!shouldConsiderAtAll(encodedFlags)) return List.of();
|
if (!shouldConsiderAtAll(encodedFlags)) return List.of();
|
||||||
|
|
||||||
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
|
encodedFlags = sortAndDeduplicate(encodedFlags);
|
||||||
|
List<AffixedWord> result = new ArrayList<>();
|
||||||
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
|
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
|
||||||
checkCanceled.run();
|
checkCanceled.run();
|
||||||
if (!encodedFlags.contains(dictionary.needaffix)) {
|
if (!FlagEnumerator.hasFlagInSortedArray(
|
||||||
|
dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) {
|
||||||
result.add(bare);
|
result.add(bare);
|
||||||
}
|
}
|
||||||
result.addAll(expand(bare, encodedFlags, checkCanceled));
|
result.addAll(expand(bare, encodedFlags, checkCanceled));
|
||||||
return new ArrayList<>(result);
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean canStemToOriginal(AffixedWord derived) {
|
private static char[] sortAndDeduplicate(char[] flags) {
|
||||||
|
Arrays.sort(flags);
|
||||||
|
for (int i = 1; i < flags.length; i++) {
|
||||||
|
if (flags[i] == flags[i - 1]) {
|
||||||
|
return deduplicate(flags);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static char[] deduplicate(char[] flags) {
|
||||||
|
Set<Character> set = new HashSet<>();
|
||||||
|
for (char flag : flags) {
|
||||||
|
set.add(flag);
|
||||||
|
}
|
||||||
|
return toSortedCharArray(set);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean canStemToOriginal(AffixedWord derived) {
|
||||||
String word = derived.getWord();
|
String word = derived.getWord();
|
||||||
char[] chars = word.toCharArray();
|
char[] chars = word.toCharArray();
|
||||||
if (isForbiddenWord(chars, 0, chars.length)) {
|
if (isForbiddenWord(chars, 0, chars.length)) {
|
||||||
|
@ -190,26 +212,20 @@ public class WordFormGenerator {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static LinkedHashSet<Character> toSet(char[] flags) {
|
private List<AffixedWord> expand(AffixedWord stem, char[] flags, Runnable checkCanceled) {
|
||||||
LinkedHashSet<Character> set = new LinkedHashSet<>();
|
List<AffixedWord> result = new ArrayList<>();
|
||||||
for (char c : flags) {
|
for (char flag : flags) {
|
||||||
set.add(c);
|
|
||||||
}
|
|
||||||
return set;
|
|
||||||
}
|
|
||||||
|
|
||||||
private LinkedHashSet<AffixedWord> expand(
|
|
||||||
AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
|
|
||||||
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
|
|
||||||
for (Character flag : flags) {
|
|
||||||
List<AffixEntry> entries = affixes.get(flag);
|
List<AffixEntry> entries = affixes.get(flag);
|
||||||
if (entries == null) continue;
|
if (entries == null) continue;
|
||||||
|
|
||||||
|
AffixKind kind = entries.get(0).kind;
|
||||||
|
if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue;
|
||||||
|
|
||||||
for (AffixEntry affix : entries) {
|
for (AffixEntry affix : entries) {
|
||||||
checkCanceled.run();
|
checkCanceled.run();
|
||||||
AffixedWord derived = affix.apply(stem, dictionary);
|
AffixedWord derived = affix.apply(stem, dictionary);
|
||||||
if (derived != null) {
|
if (derived != null) {
|
||||||
LinkedHashSet<Character> append = appendFlags(affix);
|
char[] append = appendFlags(affix);
|
||||||
if (shouldConsiderAtAll(append)) {
|
if (shouldConsiderAtAll(append)) {
|
||||||
if (canStemToOriginal(derived)) {
|
if (canStemToOriginal(derived)) {
|
||||||
result.add(derived);
|
result.add(derived);
|
||||||
|
@ -224,25 +240,37 @@ public class WordFormGenerator {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean shouldConsiderAtAll(Set<Character> flags) {
|
private boolean shouldConsiderAtAll(char[] flags) {
|
||||||
return !flags.contains(dictionary.compoundBegin)
|
for (char flag : flags) {
|
||||||
&& !flags.contains(dictionary.compoundMiddle)
|
if (flag == dictionary.compoundBegin
|
||||||
&& !flags.contains(dictionary.compoundEnd)
|
|| flag == dictionary.compoundMiddle
|
||||||
&& !flags.contains(dictionary.forbiddenword)
|
|| flag == dictionary.compoundEnd
|
||||||
&& !flags.contains(dictionary.onlyincompound);
|
|| flag == dictionary.forbiddenword
|
||||||
|
|| flag == dictionary.onlyincompound) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private LinkedHashSet<Character> updateFlags(
|
private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) {
|
||||||
Set<Character> flags, Character toRemove, Set<Character> toAppend) {
|
char[] result = new char[flags.length + toAppend.length - 1];
|
||||||
LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
|
int index = 0;
|
||||||
copy.remove(toRemove);
|
for (char flag : flags) {
|
||||||
copy.addAll(toAppend);
|
if (flag != toRemove && flag != dictionary.needaffix) {
|
||||||
return copy;
|
result[index++] = flag;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (char flag : toAppend) {
|
||||||
|
result[index++] = flag;
|
||||||
|
}
|
||||||
|
return sortAndDeduplicate(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
|
private char[] appendFlags(AffixEntry affix) {
|
||||||
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
|
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
|
||||||
return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
|
return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -272,9 +300,8 @@ public class WordFormGenerator {
|
||||||
|
|
||||||
private record AffixEntry(
|
private record AffixEntry(
|
||||||
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
|
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
|
||||||
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
|
|
||||||
if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
|
|
||||||
|
|
||||||
|
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
|
||||||
String word = stem.getWord();
|
String word = stem.getWord();
|
||||||
boolean isPrefix = kind == AffixKind.PREFIX;
|
boolean isPrefix = kind == AffixKind.PREFIX;
|
||||||
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
|
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
|
||||||
|
@ -286,24 +313,24 @@ public class WordFormGenerator {
|
||||||
if (!condition.acceptsStem(stripped)) return null;
|
if (!condition.acceptsStem(stripped)) return null;
|
||||||
|
|
||||||
String applied = isPrefix ? affix + stripped : stripped + affix;
|
String applied = isPrefix ? affix + stripped : stripped + affix;
|
||||||
List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
|
List<Affix> prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes();
|
||||||
List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
|
List<Affix> suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes());
|
||||||
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
|
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
|
||||||
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
|
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
|
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) {
|
||||||
boolean isPrefix = kind == AffixKind.PREFIX;
|
boolean isPrefix = kind == AffixKind.PREFIX;
|
||||||
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
|
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
|
||||||
if (sameAffixes.size() == 2) return false;
|
int size = sameAffixes.size();
|
||||||
if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
|
if (size == 2) return false;
|
||||||
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
|
if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false;
|
||||||
if (sameAffixes.size() == 1
|
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
|
||||||
&& !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
|
if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private class WordCompressor {
|
private class WordCompressor {
|
||||||
|
|
Loading…
Reference in New Issue