LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)

This commit is contained in:
Peter Gromov 2021-02-23 05:25:21 +01:00 committed by GitHub
parent af49df4851
commit 34993c22dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 89 additions and 84 deletions

View File

@ -297,29 +297,29 @@ public class Dictionary {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
// Accumulate output as we go
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
IntsRef output = fst.outputs.getNoOutput();
int l = offset + length;
try {
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
cp = Character.codePointAt(word, i, l);
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
return null;
} else if (arc.output() != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output());
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
cp = Character.codePointAt(word, i, l);
output = nextArc(fst, arc, bytesReader, output, cp);
if (output == null) {
return null;
}
}
return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
}
static IntsRef nextArc(
FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
try {
if (fst.findTargetArc(ch, arc, arc, reader) == null) {
return null;
} else if (arc.output() != NO_OUTPUT) {
return fst.outputs.add(output, arc.output());
} else {
return output;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return fst.outputs.add(output, arc.output());
}
/**

View File

@ -175,67 +175,85 @@ class GeneratingSuggester {
}
// suffixes
processFST(
dictionary.suffixes,
(key, ids) -> {
String suffix = new StringBuilder(toString(key)).reverse().toString();
if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return;
processAffixes(
false,
misspelled,
(suffixLength, suffixId) -> {
if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
return;
}
for (int i = 0; i < ids.length; i++) {
int suffixId = ids.ints[ids.offset + i];
if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
continue;
}
String withSuffix =
root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
result.add(withSuffix);
if (dictionary.isCrossProduct(suffixId)) {
crossProducts.add(withSuffix);
}
String suffix = misspelled.substring(misspelled.length() - suffixLength);
String withSuffix =
root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
result.add(withSuffix);
if (dictionary.isCrossProduct(suffixId)) {
crossProducts.add(withSuffix);
}
});
// cross-product prefixes
processFST(
dictionary.prefixes,
(key, ids) -> {
String prefix = toString(key);
if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
processAffixes(
true,
misspelled,
(prefixLength, prefixId) -> {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
|| !dictionary.isCrossProduct(prefixId)) {
return;
}
for (int i = 0; i < ids.length; i++) {
int prefixId = ids.ints[ids.offset + i];
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
|| !dictionary.isCrossProduct(prefixId)) {
continue;
}
for (String suffixed : crossProducts) {
if (checkAffixCondition(prefixId, suffixed)) {
result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
}
String prefix = misspelled.substring(0, prefixLength);
for (String suffixed : crossProducts) {
if (checkAffixCondition(prefixId, suffixed)) {
result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
}
}
});
// pure prefixes
processFST(
dictionary.prefixes,
(key, ids) -> {
String prefix = toString(key);
if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
for (int i = 0; i < ids.length; i++) {
int prefixId = ids.ints[ids.offset + i];
if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
result.add(prefix + root.word.substring(affixStripLength(prefixId)));
}
processAffixes(
true,
misspelled,
(prefixLength, prefixId) -> {
if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
String prefix = misspelled.substring(0, prefixLength);
result.add(prefix + root.word.substring(affixStripLength(prefixId)));
}
});
return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
}
/**
 * Enumerates the affixes of {@code word} that are present in the dictionary's affix FST.
 *
 * <p>The prefix FST is walked along the characters of {@code word} from its start, the suffix FST
 * along its characters from the end. Each time the arc reached after consuming a character is
 * final, every affix id in the accumulated output is handed to {@code processor} together with
 * the number of characters consumed so far. Because a character is always consumed before the
 * final check, the reported length is between 1 and {@code word.length()} inclusive.
 *
 * <p>The walk stops at the first character for which no arc exists. Nothing happens when the
 * selected FST is {@code null}.
 *
 * @param prefixes {@code true} to walk {@code dictionary.prefixes}, {@code false} to walk {@code
 *     dictionary.suffixes}
 * @param word the word whose leading or trailing characters are matched against the FST
 * @param processor callback invoked once per (affix length, affix id) pair found
 */
private void processAffixes(boolean prefixes, String word, AffixProcessor processor) {
  FST<IntsRef> affixFst = prefixes ? dictionary.prefixes : dictionary.suffixes;
  if (affixFst == null) {
    return;
  }

  FST.Arc<IntsRef> current = affixFst.getFirstArc(new FST.Arc<>());
  FST.BytesReader bytes = affixFst.getBytesReader();
  IntsRef accumulated = affixFst.outputs.getNoOutput();

  int wordLength = word.length();
  // "consumed" is the affix length being probed: 1, 2, ... up to the whole word.
  for (int consumed = 1; consumed <= wordLength; consumed++) {
    // Prefixes read the word left to right, suffixes right to left.
    int charIndex = prefixes ? consumed - 1 : wordLength - consumed;
    char ch = word.charAt(charIndex);
    accumulated = Dictionary.nextArc(affixFst, current, bytes, accumulated, ch);
    if (accumulated == null) {
      // No arc for this character, so no longer affix can match either.
      return;
    }
    if (!current.isFinal()) {
      continue;
    }

    IntsRef ids = affixFst.outputs.add(accumulated, current.nextFinalOutput());
    int end = ids.offset + ids.length;
    for (int k = ids.offset; k < end; k++) {
      processor.processAffix(consumed, ids.ints[k]);
    }
  }
}
/**
 * Callback that receives the affixes found by {@code processAffixes}, one call per affix id.
 *
 * <p>It is only ever implemented with lambdas, hence the {@link FunctionalInterface} annotation:
 * the compiler will reject any accidental addition of a second abstract method.
 */
@FunctionalInterface
private interface AffixProcessor {
  /**
   * Handles a single affix occurrence.
   *
   * @param affixLength number of characters of the word that were consumed to reach the affix
   * @param affixId an id taken from the FST output accumulated for that affix
   */
  void processAffix(int affixLength, int affixId);
}
private boolean hasCompatibleFlags(Root<?> root, int affixId) {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
return false;

View File

@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@ -259,12 +258,8 @@ final class Stemmer {
}
}
}
try {
return stem(
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return stem(
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
}
/**
@ -373,22 +368,18 @@ final class Stemmer {
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
RootProcessor processor)
throws IOException {
RootProcessor processor) {
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) {
if (i > 0) {
char ch = word[offset + i - 1];
if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
if (output == null) {
break;
} else if (arc.output() != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output());
}
}
if (!arc.isFinal()) {
@ -431,16 +422,13 @@ final class Stemmer {
FST<IntsRef> fst = dictionary.suffixes;
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
if (i < length) {
char ch = word[offset + i];
if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
if (output == null) {
break;
} else if (arc.output() != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output());
}
}
if (!arc.isFinal()) {
@ -610,8 +598,7 @@ final class Stemmer {
int prefixId,
int recursionDepth,
boolean prefix,
RootProcessor processor)
throws IOException {
RootProcessor processor) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);

View File

@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void fr_suggest() throws Exception {
checkSuggestionPerformance("fr", 100);
checkSuggestionPerformance("fr", 120);
}
private Dictionary loadDictionary(String code) throws IOException, ParseException {