hunspell: speed up "compress"; minimize the number of generated entries; don't even consider "forbidden" entries anymore (#13429)

Hunspell: speed up "compress"; minimize the number of generated entries; don't even consider "forbidden" entries anymore
This commit is contained in:
Peter Gromov 2024-05-28 18:09:40 +02:00 committed by GitHub
parent 4438404457
commit 54d3ff64bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 157 additions and 54 deletions

View File

@ -309,6 +309,8 @@ Improvements
* GITHUB#13276: UnifiedHighlighter: new 'passageSortComparator' option to allow sorting other than offset order. (Seunghan Jung)
* GITHUB#13429: Hunspell: speed up "compress"; minimize the number of generated entries; don't even consider "forbidden" entries anymore (Peter Gromov)
Optimizations
---------------------

View File

@ -384,27 +384,30 @@ public class WordFormGenerator {
private class WordCompressor {
private final Comparator<State> solutionFitness =
Comparator.comparingInt((State s) -> s.forbidden)
.thenComparingInt(s -> s.underGenerated)
Comparator.comparingInt((State s) -> -s.potentialCoverage)
.thenComparingInt(s -> s.stemToFlags.size())
.thenComparingInt(s -> s.underGenerated)
.thenComparingInt(s -> s.overGenerated);
private final Set<String> forbidden;
private final Runnable checkCanceled;
private final Set<String> wordSet;
private final Set<String> existingStems;
private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
private final Map<String, Set<String>> stemsToForms = new LinkedHashMap<>();
WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
this.forbidden = forbidden;
this.checkCanceled = checkCanceled;
wordSet = new HashSet<>(words);
Stemmer.StemCandidateProcessor processor =
for (String word : words) {
checkCanceled.run();
stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
var processor =
new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
@Override
boolean processStemCandidate(
char[] word,
char[] chars,
int offset,
int length,
int lastAffix,
@ -412,29 +415,34 @@ public class WordFormGenerator {
int innerPrefix,
int outerSuffix,
int innerSuffix) {
String candidate = new String(word, offset, length);
stemCounts.merge(candidate, 1, Integer::sum);
String candidate = new String(chars, offset, length);
CharHashSet flags = new CharHashSet();
if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
FlagSet flagSet = new FlagSet(flags, dictionary);
StemWithFlags swf = new StemWithFlags(candidate, Set.of(flagSet));
if (forbidden.isEmpty()
|| allGenerated(swf).stream().noneMatch(forbidden::contains)) {
registerStem(candidate);
stemToPossibleFlags
.computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
.add(new FlagSet(flags, dictionary));
.add(flagSet);
}
return true;
}
};
for (String word : words) {
checkCanceled.run();
stemCounts.merge(word, 1, Integer::sum);
stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
void registerStem(String stem) {
stemsToForms.computeIfAbsent(stem, __ -> new LinkedHashSet<>()).add(word);
}
};
processor.registerStem(word);
stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
}
existingStems =
stemCounts.keySet().stream()
stemsToForms.keySet().stream()
.filter(stem -> dictionary.lookupEntries(stem) != null)
.collect(Collectors.toSet());
}
@ -442,30 +450,49 @@ public class WordFormGenerator {
EntrySuggestion compress() {
Comparator<String> stemSorter =
Comparator.comparing((String s) -> existingStems.contains(s))
.thenComparing(stemCounts::get)
.thenComparing(s -> stemsToForms.get(s).size())
.reversed();
List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
List<String> sortedStems = stemsToForms.keySet().stream().sorted(stemSorter).toList();
PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
Set<Map<String, Set<FlagSet>>> visited = new HashSet<>();
queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
State result = null;
while (!queue.isEmpty()) {
State state = queue.poll();
if (state.underGenerated == 0) {
if (result == null || solutionFitness.compare(state, result) < 0) result = state;
if (state.forbidden == 0) break;
continue;
result = state;
break;
}
for (String stem : sortedStems) {
if (!state.stemToFlags.containsKey(stem)) {
queue.offer(addStem(state, stem));
var withStem = addStem(state, stem);
if (visited.add(withStem)) {
var next = newState(withStem);
if (next != null
&& (state.underGenerated > next.underGenerated
|| next.potentialCoverage > state.potentialCoverage)) {
queue.offer(next);
}
}
}
}
if (state.potentialCoverage < wordSet.size()) {
// don't add flags until the suggested entries can potentially cover all requested forms
continue;
}
for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
if (!entry.getValue().contains(flags)) {
queue.offer(addFlags(state, entry.getKey(), flags));
var withFlags = addFlags(state, entry.getKey(), flags);
if (visited.add(withFlags)) {
var next = newState(withFlags);
if (next != null && state.underGenerated > next.underGenerated) {
queue.offer(next);
}
}
}
}
}
@ -482,7 +509,7 @@ public class WordFormGenerator {
List<String> extraGenerated = new ArrayList<>();
for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) {
if (wordSet.contains(extra)) continue;
if (wordSet.contains(extra) || existingStems.contains(extra)) continue;
if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword));
@ -500,39 +527,55 @@ public class WordFormGenerator {
(existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
}
private State addStem(State state, String stem) {
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
private Map<String, Set<FlagSet>> addStem(State state, String stem) {
Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
stemToFlags.put(stem, Set.of());
return newState(stemToFlags);
return stemToFlags;
}
private State addFlags(State state, String stem, FlagSet flags) {
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
private Map<String, Set<FlagSet>> addFlags(State state, String stem, FlagSet flags) {
Map<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
flagSets.add(flags);
stemToFlags.put(stem, flagSets);
return newState(stemToFlags);
return stemToFlags;
}
private State newState(Map<String, Set<FlagSet>> stemToFlags) {
Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
int overGenerated = 0;
for (String s : allGenerated) {
if (forbidden.contains(s)) return null;
if (!wordSet.contains(s)) overGenerated++;
}
int potentialCoverage =
(int)
stemToFlags.keySet().stream()
.flatMap(s -> stemsToForms.get(s).stream())
.distinct()
.count();
return new State(
stemToFlags,
(int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
(int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
(int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
overGenerated,
potentialCoverage);
}
private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();
private record StemWithFlags(String stem, Set<FlagSet> flags) {}
private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
private List<String> allGenerated(StemWithFlags swc) {
Function<StemWithFlags, List<String>> expandToWords =
e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList();
return expansionCache.computeIfAbsent(swc, expandToWords);
}
private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
return stemToFlags.entrySet().stream()
.map(e -> new StemWithFlags(e.getKey(), e.getValue()))
.flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
.flatMap(
entry -> allGenerated(new StemWithFlags(entry.getKey(), entry.getValue())).stream());
}
private List<AffixedWord> expand(String stem, CharHashSet flagSet) {
@ -561,5 +604,7 @@ public class WordFormGenerator {
Map<String, Set<FlagSet>> stemToFlags,
int underGenerated,
int overGenerated,
int forbidden) {}
// The maximum number of requested forms that could be generated by adding only flags to this state
int potentialCoverage) {}
}

View File

@ -213,7 +213,7 @@ public class TestHunspell extends LuceneTestCase {
Hunspell h = loadNoTimeout("base");
String[] createQuery = {"create", "created", "creates", "creating", "creation"};
checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
checkCompression(h, "toEdit=[create/DS], toAdd=[], extra=[]", "creates", "created");
checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
@ -227,6 +227,20 @@ public class TestHunspell extends LuceneTestCase {
Hunspell h = loadNoTimeout("compress");
checkCompression(
h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");
checkCompression(h, "toEdit=[], toAdd=[f/def], extra=[]", "f", "fd", "fe", "ff");
WordFormGenerator gen = new WordFormGenerator(h.dictionary);
EntrySuggestion fAbc =
gen.compress(List.of("f", "fa", "fb", "fc"), Set.of("fyy", "fxx"), () -> {});
assertEquals("toEdit=[], toAdd=[f/abc], extra=[]", fAbc.internalsToString());
}
@Test
public void testCompressingIsFastOnLargeUnrelatedWordSets() throws Exception {
  // Twelve single-letter words; the expected suggestion adds each of them as its own
  // flagless entry, with nothing to edit and no extra generated forms.
  String[] unrelatedWords = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"};
  String expectedSuggestion = "toEdit=[], toAdd=[a, b, c, d, e, f, g, h, i, j, k, l], extra=[]";

  Hunspell hunspell = loadNoTimeout("compress");
  checkCompression(hunspell, expectedSuggestion, unrelatedWords);
}
@Test

View File

@ -1,5 +1,3 @@
FORBIDDENWORD *
SFX G Y 1
SFX G 0 ing/S .
@ -12,3 +10,47 @@ SFX S 0 s .
SFX X Y 2
SFX X 0 s .
SFX X 0 x .
# Flags for f,fa,fb,fc
SFX A Y 3
SFX A 0 a .
SFX A 0 b .
SFX A 0 yy .
SFX B Y 3
SFX B 0 c .
SFX B 0 b .
SFX B 0 xx .
SFX a Y 1
SFX a 0 a .
SFX b Y 1
SFX b 0 b .
SFX c Y 1
SFX c 0 c .
# Flags for f,fd,fe,ff, plus red-herring flags (-, +, *) that bias the greedy heuristics to prefer the "fd" stem initially
SFX d Y 1
SFX d 0 d .
SFX e Y 1
SFX e 0 e .
SFX f Y 1
SFX f 0 f .
SFX - Y 2
SFX - d 0 d
SFX - d e d
SFX + Y 2
SFX + d 0 d
SFX + d e d
SFX * Y 2
SFX * d 0 d
SFX * d e d