diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 31383b4cb8a..4370ca2357c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -84,6 +84,11 @@ API Changes IndexOutput.getFilePointer instead) and IndexOutput.setLength. (Mike McCandless) +Optimizations + +* LUCENE-5603: hunspell stemmer more efficiently strips prefixes + and suffixes. (Robert Muir) + ======================= Lucene 4.8.0 ======================= System Requirements diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index bcf3be55303..4bf881cce20 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -207,32 +207,16 @@ public class Dictionary { return lookup(words, word, offset, length); } - /** - * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found - */ + // only for testing IntsRef lookupPrefix(char word[], int offset, int length) { return lookup(prefixes, word, offset, length); } - /** - * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found - */ + // only for testing IntsRef lookupSuffix(char word[], int offset, int length) { return lookup(suffixes, word, offset, length); } - // TODO: this is pretty stupid, considering how the stemming algorithm works - // we can speed it up to be significantly faster! IntsRef lookup(FST fst, char word[], int offset, int length) { if (fst == null) { return null; @@ -396,6 +380,7 @@ public class Dictionary { String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); + boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); @@ -501,6 +486,10 @@ public class Dictionary { affixArg = cleaned.toString(); } + if (isSuffix) { + affixArg = new StringBuilder(affixArg).reverse().toString(); + } + List list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index cb33ab48fee..5a5b4fbae8a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -31,6 +31,8 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.Version; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; /** * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It @@ -54,6 +56,16 @@ final class Stemmer { public Stemmer(Dictionary dictionary) { this.dictionary = dictionary; this.affixReader = new ByteArrayDataInput(dictionary.affixData); + for (int level = 0; level < 3; level++) { + if (dictionary.prefixes != null) { + prefixArcs[level] = new FST.Arc<>(); + prefixReaders[level] = dictionary.prefixes.getBytesReader(); + } + if (dictionary.suffixes != null) { + suffixArcs[level] = new FST.Arc<>(); + suffixReaders[level] = dictionary.suffixes.getBytesReader(); + } + } } /** @@ -93,7 +105,11 @@ final class Stemmer { stems.add(newStem(word, length)); } } - stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false)); + try { + stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false)); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } return stems; } @@ -138,6 +154,16 @@ final class Stemmer { // ================================================= Helper Methods ================================================ + // some state for traversing FSTs + final FST.BytesReader prefixReaders[] = new FST.BytesReader[3]; + @SuppressWarnings("unchecked") + final FST.Arc prefixArcs[] = new FST.Arc[3]; + + final FST.BytesReader suffixReaders[] = new FST.BytesReader[3]; + @SuppressWarnings("unchecked") + final FST.Arc suffixArcs[] = new FST.Arc[3]; + + /** * Generates a list of stems for the provided word * @@ -155,16 +181,33 @@ final class Stemmer { * this means inner most suffix must also contain circumfix flag. * @return List of stems, or empty list if no stems are found */ - private List stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) { + private List stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException { // TODO: allow this stuff to be reused by tokenfilter List stems = new ArrayList<>(); if (doPrefix && dictionary.prefixes != null) { - for (int i = length - 1; i >= 0; i--) { - IntsRef prefixes = dictionary.lookupPrefix(word, 0, i); - if (prefixes == null) { + FST fst = dictionary.prefixes; + Outputs outputs = fst.outputs; + FST.BytesReader bytesReader = prefixReaders[recursionDepth]; + FST.Arc arc = prefixArcs[recursionDepth]; + fst.getFirstArc(arc); + IntsRef NO_OUTPUT = outputs.getNoOutput(); + IntsRef output = NO_OUTPUT; + for (int i = 0; i < length; i++) { + if (i > 0) { + int ch = word[i-1]; + if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { + break; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + IntsRef prefixes = null; + if (!arc.isFinal()) { continue; + } else { + prefixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < prefixes.length; j++) { @@ -218,10 +261,27 @@ final class Stemmer { } if (doSuffix && dictionary.suffixes != null) { - for (int i = 0; i < length; i++) { - IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i); - if (suffixes == null) { + FST fst = dictionary.suffixes; + Outputs outputs = fst.outputs; + FST.BytesReader bytesReader = suffixReaders[recursionDepth]; + FST.Arc arc = suffixArcs[recursionDepth]; + fst.getFirstArc(arc); + IntsRef NO_OUTPUT = outputs.getNoOutput(); + IntsRef output = NO_OUTPUT; + for (int i = length; i >= 0; i--) { + if (i < length) { + int ch = word[i]; + if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { + break; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + IntsRef suffixes = null; + if (!arc.isFinal()) { continue; + } else { + suffixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < suffixes.length; j++) { @@ -313,7 +373,7 @@ final class Stemmer { * @param prefix true if we are removing a prefix (false if its a suffix) * @return List of stems for the word, or an empty list if none are found */ - List applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) { + List applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) throws IOException { // TODO: just pass this in from before, no need to decode it twice affixReader.setPosition(8 * affix); char flag = (char) (affixReader.readShort() & 0xffff);