LUCENE-5518: minor hunspell optimizations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1576738 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-03-12 13:44:48 +00:00
parent 00718a2704
commit 0e2966b6c0
4 changed files with 121 additions and 46 deletions

View File

@ -27,6 +27,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
@ -54,6 +56,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -83,17 +86,16 @@ public class Dictionary {
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// all condition checks used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// TODO: maybe don't use Pattern for the condition check...
// TODO: when we cut over Affix to FST, just store integer index to this.
ArrayList<Pattern> patterns = new ArrayList<>();
ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal list for flagLookup
@ -103,7 +105,8 @@ public class Dictionary {
BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
BytesRefHash stripLookup = new BytesRefHash();
char[] stripData;
int[] stripOffsets;
// 8 bytes per affix
byte[] affixData = new byte[64];
@ -118,6 +121,7 @@ public class Dictionary {
boolean ignoreCase;
boolean complexPrefixes;
boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
int circumfix = -1; // circumfix flag, or -1 if one is not defined
@ -160,7 +164,6 @@ public class Dictionary {
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV
flagLookup.add(new BytesRef()); // no flags -> ord 0
stripLookup.add(new BytesRef()); // no strip -> ord 0
File aff = File.createTempFile("affix", "aff", tempDir);
OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
@ -272,6 +275,14 @@ public class Dictionary {
TreeMap<String, List<Character>> prefixes = new TreeMap<>();
TreeMap<String, List<Character>> suffixes = new TreeMap<>();
Map<String,Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
seenPatterns.put(".*", 0);
patterns.add(null);
// zero strip -> 0 ord
Map<String,Integer> seenStrips = new LinkedHashMap<>();
seenStrips.put("", 0);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
@ -283,9 +294,9 @@ public class Dictionary {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
@ -326,6 +337,22 @@ public class Dictionary {
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
int totalChars = 0;
for (String strip : seenStrips.keySet()) {
totalChars += strip.length();
}
stripData = new char[totalChars];
stripOffsets = new int[seenStrips.size()+1];
int currentOffset = 0;
int currentIndex = 0;
for (String strip : seenStrips.keySet()) {
stripOffsets[currentIndex++] = currentOffset;
strip.getChars(0, strip.length(), stripData, currentOffset);
currentOffset += strip.length();
}
assert currentIndex == seenStrips.size();
stripOffsets[currentIndex] = currentOffset;
}
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
@ -360,7 +387,8 @@ public class Dictionary {
String header,
LineNumberReader reader,
String conditionPattern,
Map<String,Integer> seenPatterns) throws IOException, ParseException {
Map<String,Integer> seenPatterns,
Map<String,Integer> seenStrips) throws IOException, ParseException {
BytesRef scratch = new BytesRef();
StringBuilder sb = new StringBuilder();
@ -399,7 +427,10 @@ public class Dictionary {
appendFlags = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
twoStageAffix = true;
}
// TODO: add test and fix zero-affix handling!
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue
@ -411,7 +442,16 @@ public class Dictionary {
condition = condition.replace("-", "\\-");
}
String regex = String.format(Locale.ROOT, conditionPattern, condition);
final String regex;
if (".".equals(condition)) {
regex = ".*"; // Zero condition is indicated by dot
} else if (condition.equals(strip)) {
regex = ".*"; // TODO: optimize this better:
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
// but this is complicated...
} else {
regex = String.format(Locale.ROOT, conditionPattern, condition);
}
// deduplicate patterns
Integer patternIndex = seenPatterns.get(regex);
@ -421,17 +461,17 @@ public class Dictionary {
throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
}
seenPatterns.put(regex, patternIndex);
Pattern pattern = Pattern.compile(regex);
CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
patterns.add(pattern);
}
scratch.copyChars(strip);
int stripOrd = stripLookup.add(scratch);
if (stripOrd < 0) {
// already exists in our hash
stripOrd = (-stripOrd)-1;
} else if (stripOrd > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
Integer stripOrd = seenStrips.get(strip);
if (stripOrd == null) {
stripOrd = seenStrips.size();
seenStrips.put(strip, stripOrd);
if (stripOrd > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
}
}
if (appendFlags == null) {
@ -449,7 +489,7 @@ public class Dictionary {
}
affixWriter.writeShort((short)flag);
affixWriter.writeShort((short)stripOrd);
affixWriter.writeShort((short)stripOrd.intValue());
// encode crossProduct into patternIndex
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short)patternOrd);
@ -765,6 +805,9 @@ public class Dictionary {
}
static char[] decodeFlags(BytesRef b) {
if (b.length == 0) {
return CharsRef.EMPTY_CHARS;
}
int len = b.length >>> 1;
char flags[] = new char[len];
int upto = 0;

View File

@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
@ -31,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
@ -160,7 +160,7 @@ final class Stemmer {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<CharsRef>();
if (doPrefix) {
if (doPrefix && dictionary.prefixes != null) {
for (int i = length - 1; i >= 0; i--) {
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
@ -197,12 +197,19 @@ final class Stemmer {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(scratch.utf8ToString())
.append(word, deAffixedStart, deAffixedLength)
.toString();
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
stems.addAll(stemList);
}
@ -210,7 +217,7 @@ final class Stemmer {
}
}
if (doSuffix) {
if (doSuffix && dictionary.suffixes != null) {
for (int i = 0; i < length; i++) {
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
@ -246,11 +253,20 @@ final class Stemmer {
if (compatible) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
// TODO: can we do this in-place?
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
stems.addAll(stemList);
}
@ -260,6 +276,30 @@ final class Stemmer {
return stems;
}
/** checks condition of the concatenation of two strings */
// note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
// but this is a little bit more complicated.
private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
// condition ordinal 0 is the reserved "no condition" entry (".*" — always matches),
// and patterns.get(0) is null, so only walk the automaton for a real condition.
if (condition != 0) {
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
int state = pattern.getInitialState();
// feed the first string slice [c1off, c1off+c1len) through the DFA;
// step() returning -1 means no transition exists for that char → reject early.
for (int i = c1off; i < c1off + c1len; i++) {
state = pattern.step(state, c1[i]);
if (state == -1) {
return false;
}
}
// continue from the same state over the second slice, so the automaton
// effectively sees the concatenation c1-slice + c2-slice without building a new String.
for (int i = c2off; i < c2off + c2len; i++) {
state = pattern.step(state, c2[i]);
if (state == -1) {
return false;
}
}
// the condition holds only if the state reached after the full concatenation is accepting
return pattern.isAccept(state);
}
return true;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
@ -273,10 +313,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if its a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
@ -285,11 +322,6 @@ final class Stemmer {
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
Pattern pattern = dictionary.patterns.get(condition);
if (!pattern.matcher(segment).matches()) {
return Collections.emptyList();
}
List<CharsRef> stems = new ArrayList<CharsRef>();
@ -338,9 +370,9 @@ final class Stemmer {
if (prefix) {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
} else if (!dictionary.complexPrefixes) {
// COMPLEXPREFIXES = false: combine with a suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
@ -350,7 +382,7 @@ final class Stemmer {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
} else if (prefix == false && dictionary.complexPrefixes == false) {
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}

View File

@ -169,7 +169,7 @@ public class TestAllDictionaries extends LuceneTestCase {
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +

View File

@ -185,7 +185,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +