From 34993c22ddbac702bfad321a04bbd753e9ade1c4 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 23 Feb 2021 05:25:21 +0100 Subject: [PATCH] LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416) --- .../lucene/analysis/hunspell/Dictionary.java | 32 +++--- .../hunspell/GeneratingSuggester.java | 106 ++++++++++-------- .../lucene/analysis/hunspell/Stemmer.java | 33 ++---- .../analysis/hunspell/TestPerformance.java | 2 +- 4 files changed, 89 insertions(+), 84 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index a3cbe39a402..115af24614e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -297,29 +297,29 @@ public class Dictionary { final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc arc = fst.getFirstArc(new FST.Arc<>()); // Accumulate output as we go - final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); - IntsRef output = NO_OUTPUT; + IntsRef output = fst.outputs.getNoOutput(); int l = offset + length; - try { - for (int i = offset, cp; i < l; i += Character.charCount(cp)) { - cp = Character.codePointAt(word, i, l); - if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { - return null; - } else if (arc.output() != NO_OUTPUT) { - output = fst.outputs.add(output, arc.output()); - } - } - if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { + for (int i = offset, cp; i < l; i += Character.charCount(cp)) { + cp = Character.codePointAt(word, i, l); + output = nextArc(fst, arc, bytesReader, output, cp); + if (output == null) { + return null; + } + } + return nextArc(fst, arc, bytesReader, output, FST.END_LABEL); + } + + static IntsRef nextArc( + FST fst, FST.Arc arc, FST.BytesReader reader, IntsRef output, int ch) { + try { + if (fst.findTargetArc(ch, arc, arc, reader) == null) { return null; - } else if (arc.output() != NO_OUTPUT) { - return fst.outputs.add(output, arc.output()); - } else { - return output; } } catch (IOException bogus) { throw new RuntimeException(bogus); } + return fst.outputs.add(output, arc.output()); } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index e211111a064..500ae155cee 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -175,67 +175,85 @@ class GeneratingSuggester { } // suffixes - processFST( - dictionary.suffixes, - (key, ids) -> { - String suffix = new StringBuilder(toString(key)).reverse().toString(); - if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return; + processAffixes( + false, + misspelled, + (suffixLength, suffixId) -> { + if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) { + return; + } - for (int i = 0; i < ids.length; i++) { - int suffixId = ids.ints[ids.offset + i]; - if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) { - continue; - } - - String withSuffix = - root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix; - result.add(withSuffix); - if (dictionary.isCrossProduct(suffixId)) { - crossProducts.add(withSuffix); - } + String suffix = misspelled.substring(misspelled.length() - suffixLength); + String withSuffix = + root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix; + result.add(withSuffix); + if (dictionary.isCrossProduct(suffixId)) { + crossProducts.add(withSuffix); } }); // cross-product prefixes - processFST( - dictionary.prefixes, - (key, ids) -> { - String prefix = toString(key); - if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return; + processAffixes( + true, + misspelled, + (prefixLength, prefixId) -> { + if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG)) + || !dictionary.isCrossProduct(prefixId)) { + return; + } - for (int i = 0; i < ids.length; i++) { - int prefixId = ids.ints[ids.offset + i]; - if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG)) - || !dictionary.isCrossProduct(prefixId)) { - continue; - } - - for (String suffixed : crossProducts) { - if (checkAffixCondition(prefixId, suffixed)) { - result.add(prefix + suffixed.substring(affixStripLength(prefixId))); - } + String prefix = misspelled.substring(0, prefixLength); + for (String suffixed : crossProducts) { + if (checkAffixCondition(prefixId, suffixed)) { + result.add(prefix + suffixed.substring(affixStripLength(prefixId))); } } }); // pure prefixes - processFST( - dictionary.prefixes, - (key, ids) -> { - String prefix = toString(key); - if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return; - - for (int i = 0; i < ids.length; i++) { - int prefixId = ids.ints[ids.offset + i]; - if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) { - result.add(prefix + root.word.substring(affixStripLength(prefixId))); - } + processAffixes( + true, + misspelled, + (prefixLength, prefixId) -> { + if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) { + String prefix = misspelled.substring(0, prefixLength); + result.add(prefix + root.word.substring(affixStripLength(prefixId))); } }); return result.stream().limit(MAX_WORDS).collect(Collectors.toList()); } + private void processAffixes(boolean prefixes, String word, AffixProcessor processor) { + FST fst = prefixes ? dictionary.prefixes : dictionary.suffixes; + if (fst == null) return; + + FST.Arc arc = fst.getFirstArc(new FST.Arc<>()); + FST.BytesReader reader = fst.getBytesReader(); + + IntsRef output = fst.outputs.getNoOutput(); + int length = word.length(); + int step = prefixes ? 1 : -1; + int limit = prefixes ? length : -1; + for (int i = prefixes ? 0 : length - 1; i != limit; i += step) { + output = Dictionary.nextArc(fst, arc, reader, output, word.charAt(i)); + if (output == null) { + break; + } + + if (arc.isFinal()) { + IntsRef affixIds = fst.outputs.add(output, arc.nextFinalOutput()); + for (int j = 0; j < affixIds.length; j++) { + processor.processAffix(prefixes ? i + 1 : length - i, affixIds.ints[affixIds.offset + j]); + } + } + } + } + + private interface AffixProcessor { + void processAffix(int affixLength, int affixId); + } + private boolean hasCompatibleFlags(Root root, int affixId) { if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) { return false; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 2a75f45d6e1..66fac294c0c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.analysis.hunspell; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -259,12 +258,8 @@ final class Stemmer { } } } - try { - return stem( - word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor); - } catch (IOException bogus) { - throw new RuntimeException(bogus); - } + return stem( + word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor); } /** @@ -373,22 +368,18 @@ final class Stemmer { int recursionDepth, boolean doPrefix, boolean previousWasPrefix, - RootProcessor processor) - throws IOException { + RootProcessor processor) { if (doPrefix && dictionary.prefixes != null) { FST fst = dictionary.prefixes; FST.Arc arc = prefixArcs[recursionDepth]; fst.getFirstArc(arc); - IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); - IntsRef output = NO_OUTPUT; + IntsRef output = fst.outputs.getNoOutput(); int limit = dictionary.fullStrip ? length + 1 : length; for (int i = 0; i < limit; i++) { if (i > 0) { - char ch = word[offset + i - 1]; - if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) { + output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]); + if (output == null) { break; - } else if (arc.output() != NO_OUTPUT) { - output = fst.outputs.add(output, arc.output()); } } if (!arc.isFinal()) { @@ -431,16 +422,13 @@ final class Stemmer { FST fst = dictionary.suffixes; FST.Arc arc = suffixArcs[recursionDepth]; fst.getFirstArc(arc); - IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); - IntsRef output = NO_OUTPUT; + IntsRef output = fst.outputs.getNoOutput(); int limit = dictionary.fullStrip ? 0 : 1; for (int i = length; i >= limit; i--) { if (i < length) { - char ch = word[offset + i]; - if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) { + output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]); + if (output == null) { break; - } else if (arc.output() != NO_OUTPUT) { - output = fst.outputs.add(output, arc.output()); } } if (!arc.isFinal()) { @@ -610,8 +598,7 @@ final class Stemmer { int prefixId, int recursionDepth, boolean prefix, - RootProcessor processor) - throws IOException { + RootProcessor processor) { char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG); boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index ca38ca8e746..174ec444e9d 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase { @Test public void fr_suggest() throws Exception { - checkSuggestionPerformance("fr", 100); + checkSuggestionPerformance("fr", 120); } private Dictionary loadDictionary(String code) throws IOException, ParseException {