mirror of https://github.com/apache/lucene.git
LUCENE-5603: fix hunspell to use FST efficiently
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1587162 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3800409581
commit
4aa0a1db1c
|
@ -84,6 +84,11 @@ API Changes
|
|||
IndexOutput.getFilePointer instead) and IndexOutput.setLength.
|
||||
(Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-5603: hunspell stemmer more efficiently strips prefixes
|
||||
and suffixes. (Robert Muir)
|
||||
|
||||
======================= Lucene 4.8.0 =======================
|
||||
|
||||
System Requirements
|
||||
|
|
|
@ -207,32 +207,16 @@ public class Dictionary {
|
|||
return lookup(words, word, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
|
||||
*
|
||||
* @param word Char array to generate the String from
|
||||
* @param offset Offset in the char array that the String starts at
|
||||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
// only for testing
|
||||
IntsRef lookupPrefix(char word[], int offset, int length) {
|
||||
return lookup(prefixes, word, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
|
||||
*
|
||||
* @param word Char array to generate the String from
|
||||
* @param offset Offset in the char array that the String starts at
|
||||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
// only for testing
|
||||
IntsRef lookupSuffix(char word[], int offset, int length) {
|
||||
return lookup(suffixes, word, offset, length);
|
||||
}
|
||||
|
||||
// TODO: this is pretty stupid, considering how the stemming algorithm works
|
||||
// we can speed it up to be significantly faster!
|
||||
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
|
||||
if (fst == null) {
|
||||
return null;
|
||||
|
@ -396,6 +380,7 @@ public class Dictionary {
|
|||
String args[] = header.split("\\s+");
|
||||
|
||||
boolean crossProduct = args[2].equals("Y");
|
||||
boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
|
||||
|
||||
int numLines = Integer.parseInt(args[3]);
|
||||
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
|
||||
|
@ -501,6 +486,10 @@ public class Dictionary {
|
|||
affixArg = cleaned.toString();
|
||||
}
|
||||
|
||||
if (isSuffix) {
|
||||
affixArg = new StringBuilder(affixArg).reverse().toString();
|
||||
}
|
||||
|
||||
List<Character> list = affixes.get(affixArg);
|
||||
if (list == null) {
|
||||
list = new ArrayList<>();
|
||||
|
|
|
@ -31,6 +31,8 @@ import org.apache.lucene.util.CharsRef;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
|
||||
/**
|
||||
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
|
||||
|
@ -54,6 +56,16 @@ final class Stemmer {
|
|||
public Stemmer(Dictionary dictionary) {
|
||||
this.dictionary = dictionary;
|
||||
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
|
||||
for (int level = 0; level < 3; level++) {
|
||||
if (dictionary.prefixes != null) {
|
||||
prefixArcs[level] = new FST.Arc<>();
|
||||
prefixReaders[level] = dictionary.prefixes.getBytesReader();
|
||||
}
|
||||
if (dictionary.suffixes != null) {
|
||||
suffixArcs[level] = new FST.Arc<>();
|
||||
suffixReaders[level] = dictionary.suffixes.getBytesReader();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -93,7 +105,11 @@ final class Stemmer {
|
|||
stems.add(newStem(word, length));
|
||||
}
|
||||
}
|
||||
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
|
||||
try {
|
||||
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
|
@ -138,6 +154,16 @@ final class Stemmer {
|
|||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
// some state for traversing FSTs
|
||||
final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
|
||||
@SuppressWarnings("unchecked")
|
||||
final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];
|
||||
|
||||
final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
|
||||
@SuppressWarnings("unchecked")
|
||||
final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];
|
||||
|
||||
|
||||
/**
|
||||
* Generates a list of stems for the provided word
|
||||
*
|
||||
|
@ -155,16 +181,33 @@ final class Stemmer {
|
|||
* this means inner most suffix must also contain circumfix flag.
|
||||
* @return List of stems, or empty list if no stems are found
|
||||
*/
|
||||
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) {
|
||||
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException {
|
||||
|
||||
// TODO: allow this stuff to be reused by tokenfilter
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
|
||||
if (doPrefix && dictionary.prefixes != null) {
|
||||
for (int i = length - 1; i >= 0; i--) {
|
||||
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
if (prefixes == null) {
|
||||
FST<IntsRef> fst = dictionary.prefixes;
|
||||
Outputs<IntsRef> outputs = fst.outputs;
|
||||
FST.BytesReader bytesReader = prefixReaders[recursionDepth];
|
||||
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
|
||||
fst.getFirstArc(arc);
|
||||
IntsRef NO_OUTPUT = outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (i > 0) {
|
||||
int ch = word[i-1];
|
||||
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
||||
break;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
}
|
||||
IntsRef prefixes = null;
|
||||
if (!arc.isFinal()) {
|
||||
continue;
|
||||
} else {
|
||||
prefixes = fst.outputs.add(output, arc.nextFinalOutput);
|
||||
}
|
||||
|
||||
for (int j = 0; j < prefixes.length; j++) {
|
||||
|
@ -218,10 +261,27 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
if (doSuffix && dictionary.suffixes != null) {
|
||||
for (int i = 0; i < length; i++) {
|
||||
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
if (suffixes == null) {
|
||||
FST<IntsRef> fst = dictionary.suffixes;
|
||||
Outputs<IntsRef> outputs = fst.outputs;
|
||||
FST.BytesReader bytesReader = suffixReaders[recursionDepth];
|
||||
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
|
||||
fst.getFirstArc(arc);
|
||||
IntsRef NO_OUTPUT = outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
for (int i = length; i >= 0; i--) {
|
||||
if (i < length) {
|
||||
int ch = word[i];
|
||||
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
||||
break;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
}
|
||||
IntsRef suffixes = null;
|
||||
if (!arc.isFinal()) {
|
||||
continue;
|
||||
} else {
|
||||
suffixes = fst.outputs.add(output, arc.nextFinalOutput);
|
||||
}
|
||||
|
||||
for (int j = 0; j < suffixes.length; j++) {
|
||||
|
@ -313,7 +373,7 @@ final class Stemmer {
|
|||
* @param prefix true if we are removing a prefix (false if its a suffix)
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
|
||||
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) throws IOException {
|
||||
// TODO: just pass this in from before, no need to decode it twice
|
||||
affixReader.setPosition(8 * affix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
|
|
Loading…
Reference in New Issue