mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 21:39:25 +00:00
LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)
This commit is contained in:
parent
af49df4851
commit
34993c22dd
@ -297,29 +297,29 @@ public class Dictionary {
|
||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
||||
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
|
||||
// Accumulate output as we go
|
||||
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
IntsRef output = fst.outputs.getNoOutput();
|
||||
|
||||
int l = offset + length;
|
||||
try {
|
||||
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
|
||||
cp = Character.codePointAt(word, i, l);
|
||||
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
|
||||
return null;
|
||||
} else if (arc.output() != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
}
|
||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
|
||||
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
|
||||
cp = Character.codePointAt(word, i, l);
|
||||
output = nextArc(fst, arc, bytesReader, output, cp);
|
||||
if (output == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
|
||||
}
|
||||
|
||||
static IntsRef nextArc(
|
||||
FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
|
||||
try {
|
||||
if (fst.findTargetArc(ch, arc, arc, reader) == null) {
|
||||
return null;
|
||||
} else if (arc.output() != NO_OUTPUT) {
|
||||
return fst.outputs.add(output, arc.output());
|
||||
} else {
|
||||
return output;
|
||||
}
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
return fst.outputs.add(output, arc.output());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -175,67 +175,85 @@ class GeneratingSuggester {
|
||||
}
|
||||
|
||||
// suffixes
|
||||
processFST(
|
||||
dictionary.suffixes,
|
||||
(key, ids) -> {
|
||||
String suffix = new StringBuilder(toString(key)).reverse().toString();
|
||||
if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return;
|
||||
processAffixes(
|
||||
false,
|
||||
misspelled,
|
||||
(suffixLength, suffixId) -> {
|
||||
if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ids.length; i++) {
|
||||
int suffixId = ids.ints[ids.offset + i];
|
||||
if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String withSuffix =
|
||||
root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
|
||||
result.add(withSuffix);
|
||||
if (dictionary.isCrossProduct(suffixId)) {
|
||||
crossProducts.add(withSuffix);
|
||||
}
|
||||
String suffix = misspelled.substring(misspelled.length() - suffixLength);
|
||||
String withSuffix =
|
||||
root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
|
||||
result.add(withSuffix);
|
||||
if (dictionary.isCrossProduct(suffixId)) {
|
||||
crossProducts.add(withSuffix);
|
||||
}
|
||||
});
|
||||
|
||||
// cross-product prefixes
|
||||
processFST(
|
||||
dictionary.prefixes,
|
||||
(key, ids) -> {
|
||||
String prefix = toString(key);
|
||||
if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
|
||||
processAffixes(
|
||||
true,
|
||||
misspelled,
|
||||
(prefixLength, prefixId) -> {
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
|
||||
|| !dictionary.isCrossProduct(prefixId)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ids.length; i++) {
|
||||
int prefixId = ids.ints[ids.offset + i];
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
|
||||
|| !dictionary.isCrossProduct(prefixId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (String suffixed : crossProducts) {
|
||||
if (checkAffixCondition(prefixId, suffixed)) {
|
||||
result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
|
||||
}
|
||||
String prefix = misspelled.substring(0, prefixLength);
|
||||
for (String suffixed : crossProducts) {
|
||||
if (checkAffixCondition(prefixId, suffixed)) {
|
||||
result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// pure prefixes
|
||||
processFST(
|
||||
dictionary.prefixes,
|
||||
(key, ids) -> {
|
||||
String prefix = toString(key);
|
||||
if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
|
||||
|
||||
for (int i = 0; i < ids.length; i++) {
|
||||
int prefixId = ids.ints[ids.offset + i];
|
||||
if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
|
||||
result.add(prefix + root.word.substring(affixStripLength(prefixId)));
|
||||
}
|
||||
processAffixes(
|
||||
true,
|
||||
misspelled,
|
||||
(prefixLength, prefixId) -> {
|
||||
if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
|
||||
String prefix = misspelled.substring(0, prefixLength);
|
||||
result.add(prefix + root.word.substring(affixStripLength(prefixId)));
|
||||
}
|
||||
});
|
||||
|
||||
return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private void processAffixes(boolean prefixes, String word, AffixProcessor processor) {
|
||||
FST<IntsRef> fst = prefixes ? dictionary.prefixes : dictionary.suffixes;
|
||||
if (fst == null) return;
|
||||
|
||||
FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
|
||||
FST.BytesReader reader = fst.getBytesReader();
|
||||
|
||||
IntsRef output = fst.outputs.getNoOutput();
|
||||
int length = word.length();
|
||||
int step = prefixes ? 1 : -1;
|
||||
int limit = prefixes ? length : -1;
|
||||
for (int i = prefixes ? 0 : length - 1; i != limit; i += step) {
|
||||
output = Dictionary.nextArc(fst, arc, reader, output, word.charAt(i));
|
||||
if (output == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (arc.isFinal()) {
|
||||
IntsRef affixIds = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
for (int j = 0; j < affixIds.length; j++) {
|
||||
processor.processAffix(prefixes ? i + 1 : length - i, affixIds.ints[affixIds.offset + j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private interface AffixProcessor {
|
||||
void processAffix(int affixLength, int affixId);
|
||||
}
|
||||
|
||||
private boolean hasCompatibleFlags(Root<?> root, int affixId) {
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
|
||||
return false;
|
||||
|
@ -16,7 +16,6 @@
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
@ -259,12 +258,8 @@ final class Stemmer {
|
||||
}
|
||||
}
|
||||
}
|
||||
try {
|
||||
return stem(
|
||||
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
return stem(
|
||||
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -373,22 +368,18 @@ final class Stemmer {
|
||||
int recursionDepth,
|
||||
boolean doPrefix,
|
||||
boolean previousWasPrefix,
|
||||
RootProcessor processor)
|
||||
throws IOException {
|
||||
RootProcessor processor) {
|
||||
if (doPrefix && dictionary.prefixes != null) {
|
||||
FST<IntsRef> fst = dictionary.prefixes;
|
||||
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
|
||||
fst.getFirstArc(arc);
|
||||
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
IntsRef output = fst.outputs.getNoOutput();
|
||||
int limit = dictionary.fullStrip ? length + 1 : length;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
if (i > 0) {
|
||||
char ch = word[offset + i - 1];
|
||||
if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
|
||||
output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
|
||||
if (output == null) {
|
||||
break;
|
||||
} else if (arc.output() != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
}
|
||||
if (!arc.isFinal()) {
|
||||
@ -431,16 +422,13 @@ final class Stemmer {
|
||||
FST<IntsRef> fst = dictionary.suffixes;
|
||||
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
|
||||
fst.getFirstArc(arc);
|
||||
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
IntsRef output = fst.outputs.getNoOutput();
|
||||
int limit = dictionary.fullStrip ? 0 : 1;
|
||||
for (int i = length; i >= limit; i--) {
|
||||
if (i < length) {
|
||||
char ch = word[offset + i];
|
||||
if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
|
||||
output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
|
||||
if (output == null) {
|
||||
break;
|
||||
} else if (arc.output() != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
}
|
||||
if (!arc.isFinal()) {
|
||||
@ -610,8 +598,7 @@ final class Stemmer {
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
boolean prefix,
|
||||
RootProcessor processor)
|
||||
throws IOException {
|
||||
RootProcessor processor) {
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
|
||||
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
|
||||
|
@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||
|
||||
@Test
|
||||
public void fr_suggest() throws Exception {
|
||||
checkSuggestionPerformance("fr", 100);
|
||||
checkSuggestionPerformance("fr", 120);
|
||||
}
|
||||
|
||||
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
||||
|
Loading…
x
Reference in New Issue
Block a user