mirror of https://github.com/apache/lucene.git
LUCENE-5518: minor hunspell optimizations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1576738 13f79535-47bb-0310-9956-ffa450edef68
commit 0e2966b6c0
parent 00718a2704
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -27,6 +27,8 @@ import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
@@ -54,6 +56,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -83,17 +86,16 @@ public class Dictionary {
   private static final String UTF8_FLAG_TYPE = "UTF-8";
   private static final String LONG_FLAG_TYPE = "long";
 
   // TODO: really for suffixes we should reverse the automaton and run them backwards
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
 
-  // all Patterns used by prefixes and suffixes. these are typically re-used across
+  // all condition checks used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
-  // TODO: maybe don't use Pattern for the condition check...
   // TODO: when we cut over Affix to FST, just store integer index to this.
-  ArrayList<Pattern> patterns = new ArrayList<>();
+  ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
 
   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal list for flagLookup
@@ -103,7 +105,8 @@ public class Dictionary {
   BytesRefHash flagLookup = new BytesRefHash();
 
   // the list of unique strip affixes.
-  BytesRefHash stripLookup = new BytesRefHash();
+  char[] stripData;
+  int[] stripOffsets;
 
   // 8 bytes per affix
   byte[] affixData = new byte[64];
@@ -118,6 +121,7 @@ public class Dictionary {
 
   boolean ignoreCase;
   boolean complexPrefixes;
+  boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
 
   int circumfix = -1; // circumfix flag, or -1 if one is not defined
 
@@ -160,7 +164,6 @@ public class Dictionary {
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
     flagLookup.add(new BytesRef()); // no flags -> ord 0
-    stripLookup.add(new BytesRef()); // no strip -> ord 0
 
     File aff = File.createTempFile("affix", "aff", tempDir);
     OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
@@ -272,6 +275,14 @@ public class Dictionary {
     TreeMap<String, List<Character>> prefixes = new TreeMap<>();
     TreeMap<String, List<Character>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
+
+    // zero condition -> 0 ord
+    seenPatterns.put(".*", 0);
+    patterns.add(null);
+
+    // zero strip -> 0 ord
+    Map<String,Integer> seenStrips = new LinkedHashMap<>();
+    seenStrips.put("", 0);
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
@@ -283,9 +294,9 @@ public class Dictionary {
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
@@ -326,6 +337,22 @@ public class Dictionary {
 
     this.prefixes = affixFST(prefixes);
     this.suffixes = affixFST(suffixes);
+
+    int totalChars = 0;
+    for (String strip : seenStrips.keySet()) {
+      totalChars += strip.length();
+    }
+    stripData = new char[totalChars];
+    stripOffsets = new int[seenStrips.size()+1];
+    int currentOffset = 0;
+    int currentIndex = 0;
+    for (String strip : seenStrips.keySet()) {
+      stripOffsets[currentIndex++] = currentOffset;
+      strip.getChars(0, strip.length(), stripData, currentOffset);
+      currentOffset += strip.length();
+    }
+    assert currentIndex == seenStrips.size();
+    stripOffsets[currentIndex] = currentOffset;
   }
 
   private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
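For context on the hunk above: the unique strip strings now live in one shared char[] pool plus an offsets table with a trailing sentinel, so fetching strip ordinal i is two array reads with no per-entry objects or hash lookups. A minimal standalone sketch of the same layout; the class name, sample strings, and readback at the end are illustrative, not part of the patch:

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Sketch: strings packed into one char[] pool, indexed by ordinal.
    // offsets has size+1 entries, so entry i spans [offsets[i], offsets[i+1]).
    public class StripPoolSketch {
      public static void main(String[] args) {
        Map<String,Integer> seen = new LinkedHashMap<>(); // insertion order = ordinal order
        seen.put("", 0);                                  // zero strip -> ord 0
        seen.put("y", 1);
        seen.put("ig", 2);

        int total = 0;
        for (String s : seen.keySet()) total += s.length();
        char[] data = new char[total];
        int[] offsets = new int[seen.size() + 1];
        int off = 0, idx = 0;
        for (String s : seen.keySet()) {
          offsets[idx++] = off;
          s.getChars(0, s.length(), data, off);
          off += s.length();
        }
        offsets[idx] = off;                               // sentinel: end of last entry

        int ord = 2;                                      // read an entry back
        System.out.println(new String(data, offsets[ord], offsets[ord + 1] - offsets[ord])); // "ig"
      }
    }

The LinkedHashMap matters here: ordinals are handed out in insertion order while parsing, so iterating keySet() in that same order keeps stripOffsets[ord] aligned with the ordinals already serialized into affixData.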
@@ -360,7 +387,8 @@ public class Dictionary {
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
-                          Map<String,Integer> seenPatterns) throws IOException, ParseException {
+                          Map<String,Integer> seenPatterns,
+                          Map<String,Integer> seenStrips) throws IOException, ParseException {
 
     BytesRef scratch = new BytesRef();
     StringBuilder sb = new StringBuilder();
@@ -399,7 +427,10 @@ public class Dictionary {
 
       appendFlags = flagParsingStrategy.parseFlags(flagPart);
       Arrays.sort(appendFlags);
+      twoStageAffix = true;
     }
 
+    // TODO: add test and fix zero-affix handling!
+
     String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
     // at least the gascon affix file has this issue
@@ -411,7 +442,16 @@ public class Dictionary {
       condition = condition.replace("-", "\\-");
     }
 
-    String regex = String.format(Locale.ROOT, conditionPattern, condition);
+    final String regex;
+    if (".".equals(condition)) {
+      regex = ".*"; // Zero condition is indicated by dot
+    } else if (condition.equals(strip)) {
+      regex = ".*"; // TODO: optimize this better:
+                    // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
+                    // but this is complicated...
+    } else {
+      regex = String.format(Locale.ROOT, conditionPattern, condition);
+    }
 
     // deduplicate patterns
     Integer patternIndex = seenPatterns.get(regex);
@@ -421,17 +461,17 @@ public class Dictionary {
         throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
       }
       seenPatterns.put(regex, patternIndex);
-      Pattern pattern = Pattern.compile(regex);
+      CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
      patterns.add(pattern);
     }
 
-    scratch.copyChars(strip);
-    int stripOrd = stripLookup.add(scratch);
-    if (stripOrd < 0) {
-      // already exists in our hash
-      stripOrd = (-stripOrd)-1;
-    } else if (stripOrd > Character.MAX_VALUE) {
-      throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+    Integer stripOrd = seenStrips.get(strip);
+    if (stripOrd == null) {
+      stripOrd = seenStrips.size();
+      seenStrips.put(strip, stripOrd);
+      if (stripOrd > Character.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+      }
     }
 
     if (appendFlags == null) {
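CharacterRunAutomaton (a real class in org.apache.lucene.util.automaton, imported at the top of this diff) compiles the regular expression into a DFA table once; every later check is then a loop of table lookups, with none of the Matcher allocation that Pattern.matcher() incurs per call. A hedged sketch of the build-once, match-many usage; the example pattern and input strings are made up:

    import org.apache.lucene.util.automaton.CharacterRunAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    public class ConditionSketch {
      public static void main(String[] args) {
        // Build once per unique condition string, as the patch does:
        CharacterRunAutomaton condition =
            new CharacterRunAutomaton(new RegExp(".*ing", RegExp.NONE).toAutomaton());

        // Whole-string match in one call, no per-check allocation:
        System.out.println(condition.run("stripping"));          // true

        // Or drive the DFA one transition at a time:
        int state = condition.getInitialState();
        for (char c : "stripping".toCharArray()) {
          state = condition.step(state, c);
          if (state == -1) break;                                // dead state: no match possible
        }
        System.out.println(state != -1 && condition.isAccept(state)); // true
      }
    }

run() answers a whole-string match in one call, while getInitialState()/step()/isAccept() expose the same DFA one transition at a time; the new Stemmer.checkCondition later in this diff relies on the stepwise form.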
@@ -449,7 +489,7 @@ public class Dictionary {
     }
 
     affixWriter.writeShort((short)flag);
-    affixWriter.writeShort((short)stripOrd);
+    affixWriter.writeShort((short)stripOrd.intValue());
     // encode crossProduct into patternIndex
     int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
     affixWriter.writeShort((short)patternOrd);
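As the "8 bytes per affix" comment in an earlier hunk says, each affix rule is serialized as four shorts: the flag, the strip ordinal, the condition ordinal with crossProduct packed into its low bit, and a final short that Stemmer.applyAffix reads back as append. A sketch of the round-trip under that layout; the concrete values are illustrative:

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteArrayDataOutput;

    public class AffixRecordSketch {
      public static void main(String[] args) {
        // One affix = four shorts = 8 bytes:
        // [flag][stripOrd][conditionOrd << 1 | crossProduct][appendFlagsOrd]
        byte[] affixData = new byte[8];
        ByteArrayDataOutput out = new ByteArrayDataOutput(affixData);
        out.writeShort((short) 'A');                  // flag
        out.writeShort((short) 3);                    // stripOrd (illustrative)
        out.writeShort((short) (7 << 1 | 1));         // conditionOrd=7, crossProduct=true
        out.writeShort((short) 2);                    // appendFlagsOrd (illustrative)

        ByteArrayDataInput in = new ByteArrayDataInput(affixData);
        char flag = (char) (in.readShort() & 0xffff);
        int stripOrd = in.readShort() & 0xffff;
        int condition = in.readShort() & 0xffff;
        boolean crossProduct = (condition & 1) == 1;  // low bit
        condition >>>= 1;                             // remaining bits: condition ordinal
        int append = in.readShort() & 0xffff;
        System.out.println(flag + " " + stripOrd + " " + condition + " " + crossProduct + " " + append);
      }
    }

Packing the boolean into the condition ordinal is why the parser caps ordinals well below the short range: one bit of the 16 is spent on crossProduct.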
@@ -765,6 +805,9 @@ public class Dictionary {
   }
 
   static char[] decodeFlags(BytesRef b) {
+    if (b.length == 0) {
+      return CharsRef.EMPTY_CHARS;
+    }
     int len = b.length >>> 1;
     char flags[] = new char[len];
     int upto = 0;
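decodeFlags' storage format packs each flag as one UTF-16 code unit, two bytes per flag, inside a BytesRef; the early return added above simply hands back a shared empty array instead of allocating a fresh char[0] for the very common no-flags (ord 0) entry. A sketch of an encode/decode round-trip under that layout — the helper names and the big-endian byte order are assumptions for illustration, not lifted from the patch:

    import org.apache.lucene.util.BytesRef;

    public class FlagCodecSketch {
      // Two bytes per flag char; byte order here is an assumption.
      static BytesRef encodeFlags(char[] flags) {
        byte[] b = new byte[flags.length << 1];
        for (int i = 0; i < flags.length; i++) {
          b[i << 1] = (byte) (flags[i] >> 8);
          b[(i << 1) + 1] = (byte) flags[i];
        }
        return new BytesRef(b);
      }

      static char[] decodeFlags(BytesRef b) {
        char[] flags = new char[b.length >>> 1];      // two bytes per flag
        for (int i = 0; i < flags.length; i++) {
          int off = b.offset + (i << 1);
          flags[i] = (char) ((b.bytes[off] & 0xff) << 8 | (b.bytes[off + 1] & 0xff));
        }
        return flags;
      }

      public static void main(String[] args) {
        System.out.println(new String(decodeFlags(encodeFlags("AB".toCharArray())))); // AB
      }
    }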
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -22,7 +22,6 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
@@ -31,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
@@ -160,7 +160,7 @@ final class Stemmer {
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
-    if (doPrefix) {
+    if (doPrefix && dictionary.prefixes != null) {
       for (int i = length - 1; i >= 0; i--) {
         IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
         if (prefixes == null) {
@@ -197,12 +197,19 @@ final class Stemmer {
           int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;
 
-          dictionary.stripLookup.get(stripOrd, scratch);
-          String strippedWord = new StringBuilder().append(scratch.utf8ToString())
-              .append(word, deAffixedStart, deAffixedLength)
-              .toString();
+          int stripStart = dictionary.stripOffsets[stripOrd];
+          int stripEnd = dictionary.stripOffsets[stripOrd+1];
+          int stripLength = stripEnd - stripStart;
 
-          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
+          if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
+            continue;
+          }
+
+          char strippedWord[] = new char[stripLength + deAffixedLength];
+          System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
+          System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
+
+          List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
 
           stems.addAll(stemList);
         }
@@ -210,7 +217,7 @@ final class Stemmer {
       }
     }
 
-    if (doSuffix) {
+    if (doSuffix && dictionary.suffixes != null) {
       for (int i = 0; i < length; i++) {
         IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
         if (suffixes == null) {
@@ -246,11 +253,20 @@ final class Stemmer {
           if (compatible) {
             int appendLength = length - i;
             int deAffixedLength = length - appendLength;
-            // TODO: can we do this in-place?
-            dictionary.stripLookup.get(stripOrd, scratch);
-            String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
 
-            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
+            int stripStart = dictionary.stripOffsets[stripOrd];
+            int stripEnd = dictionary.stripOffsets[stripOrd+1];
+            int stripLength = stripEnd - stripStart;
+
+            if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
+              continue;
+            }
+
+            char strippedWord[] = new char[stripLength + deAffixedLength];
+            System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
+            System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
+
+            List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
 
             stems.addAll(stemList);
           }
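In both the prefix and suffix hunks, the candidate stem is now assembled with a single char[] allocation and two System.arraycopy calls, where the old path paid for scratch.utf8ToString(), a StringBuilder, toString(), and toCharArray() on every candidate. The shape of the suffix case as a standalone sketch; the word, pool contents, and offsets are made up (think of an English y -> ies rule):

    public class StripConcatSketch {
      public static void main(String[] args) {
        // Suffix case: candidate stem = word minus the appended ending,
        // plus the strip chars taken straight from the shared pool.
        char[] word = "flies".toCharArray();
        int deAffixedLength = 2;                      // keep "fl" ("ies" was the append)
        char[] stripData = "xybc".toCharArray();      // illustrative pool; "y" at offset 1
        int stripStart = 1, stripLength = 1;

        char[] strippedWord = new char[deAffixedLength + stripLength];
        System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
        System.arraycopy(stripData, stripStart, strippedWord, deAffixedLength, stripLength);
        System.out.println(new String(strippedWord)); // "fly": one allocation, no Strings
      }
    }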
@@ -260,6 +276,30 @@ final class Stemmer {
 
     return stems;
   }
+
+  /** checks condition of the concatenation of two strings */
+  // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
+  // but this is a little bit more complicated.
+  private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
+    if (condition != 0) {
+      CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
+      int state = pattern.getInitialState();
+      for (int i = c1off; i < c1off + c1len; i++) {
+        state = pattern.step(state, c1[i]);
+        if (state == -1) {
+          return false;
+        }
+      }
+      for (int i = c2off; i < c2off + c2len; i++) {
+        state = pattern.step(state, c2[i]);
+        if (state == -1) {
+          return false;
+        }
+      }
+      return pattern.isAccept(state);
+    }
+    return true;
+  }
 
   /**
    * Applies the affix rule to the given word, producing a list of stems if any are found
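The property checkCondition leans on: stepping a DFA through one segment and then another is equivalent to matching their concatenation, so the condition can be verified against the two pieces without ever materializing the combined string. A small hedged demonstration; the pattern and inputs are illustrative:

    import org.apache.lucene.util.automaton.CharacterRunAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    public class SplitMatchSketch {
      public static void main(String[] args) {
        CharacterRunAutomaton dfa =
            new CharacterRunAutomaton(new RegExp(".*[^aeiou]y", RegExp.NONE).toAutomaton());

        // Step across two segments without concatenating them...
        String stem = "fl", strip = "y";
        int state = dfa.getInitialState();
        for (int i = 0; i < stem.length() && state != -1; i++) state = dfa.step(state, stem.charAt(i));
        for (int i = 0; i < strip.length() && state != -1; i++) state = dfa.step(state, strip.charAt(i));
        boolean viaSteps = state != -1 && dfa.isAccept(state);

        // ...and it agrees with matching the concatenation directly:
        System.out.println(viaSteps + " == " + dfa.run(stem + strip)); // true == true
      }
    }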
@@ -273,10 +313,7 @@ final class Stemmer {
    * @param prefix true if we are removing a prefix (false if its a suffix)
    * @return List of stems for the word, or an empty list if none are found
    */
   List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
-    segment.setLength(0);
-    segment.append(strippedWord, 0, length);
-
     // TODO: just pass this in from before, no need to decode it twice
     affixReader.setPosition(8 * affix);
     char flag = (char) (affixReader.readShort() & 0xffff);
@@ -285,11 +322,6 @@ final class Stemmer {
     boolean crossProduct = (condition & 1) == 1;
     condition >>>= 1;
     char append = (char) (affixReader.readShort() & 0xffff);
-
-    Pattern pattern = dictionary.patterns.get(condition);
-    if (!pattern.matcher(segment).matches()) {
-      return Collections.emptyList();
-    }
 
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
@@ -338,9 +370,9 @@ final class Stemmer {
       if (prefix) {
         // we took away the first prefix.
         //  COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
-        //  COMPLEXPREFIXES = false: combine with another suffix
-        stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
-      } else if (!dictionary.complexPrefixes) {
+        //  COMPLEXPREFIXES = false: combine with a suffix
+        stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
+      } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
         // we took away a suffix.
         //  COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
         //  COMPLEXPREFIXES = false: combine with another suffix
@@ -350,7 +382,7 @@ final class Stemmer {
       if (prefix && dictionary.complexPrefixes) {
         // we took away the second prefix: go look for another suffix
         stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
-      } else if (prefix == false && dictionary.complexPrefixes == false) {
+      } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
         // we took away a prefix, then a suffix: go look for another suffix
         stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
       }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -169,7 +169,7 @@ public class TestAllDictionaries extends LuceneTestCase {
         System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                            "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                            "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
-                           "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                           "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
                            "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
                            "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                            "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
@@ -185,7 +185,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
         System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                            "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                            "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
-                           "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                           "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
                            "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
                            "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                            "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +