LUCENE-5468: convert affixes to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-02-27 17:53:30 +00:00
parent cdec14902b
commit b2b86dd8ad
4 changed files with 83 additions and 19 deletions

View File

@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import java.io.*;
import java.nio.charset.Charset;
@ -46,6 +48,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
/**
@ -68,8 +71,8 @@ public class Dictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
public CharArrayMap<List<Character>> prefixes;
public CharArrayMap<List<Character>> suffixes;
public FST<IntsRef> prefixes;
public FST<IntsRef> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
@ -137,7 +140,7 @@ public class Dictionary {
ord = lookupOrd(word, offset, length);
} catch (IOException ex) { /* bogus */ }
if (ord == null) {
return null;
return null;
}
return decodeFlags(flagLookup.get(ord, scratch));
}
@ -175,8 +178,8 @@ public class Dictionary {
* @param length Length from the offset that the String is
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
*/
public List<Character> lookupPrefix(char word[], int offset, int length) {
return prefixes.get(word, offset, length);
IntsRef lookupPrefix(char word[], int offset, int length) {
return lookupAffix(prefixes, word, offset, length);
}
/**
@ -187,8 +190,42 @@ public class Dictionary {
* @param length Length from the offset that the String is
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
*/
List<Character> lookupSuffix(char word[], int offset, int length) {
return suffixes.get(word, offset, length);
IntsRef lookupSuffix(char word[], int offset, int length) {
return lookupAffix(suffixes, word, offset, length);
}
// TODO: this is pretty stupid, considering how the stemming algorithm works
// we can speed it up to be significantly faster!
IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
if (fst == null) {
return null;
}
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
// Accumulate output as we go
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int l = offset + length;
try {
for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
cp = Character.codePointAt(word, i, l);
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else {
return output;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/**
@ -199,8 +236,8 @@ public class Dictionary {
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
TreeMap<String, List<Character>> prefixes = new TreeMap<>();
TreeMap<String, List<Character>> suffixes = new TreeMap<>();
Map<String,Integer> seenPatterns = new HashMap<>();
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@ -218,6 +255,27 @@ public class Dictionary {
flagParsingStrategy = getFlagParsingStrategy(line);
}
}
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
}
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
IntsRef scratch = new IntsRef();
for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
List<Character> entries = entry.getValue();
IntsRef output = new IntsRef(entries.size());
int upto = 0;
for (Character c : entries) {
output.ints[output.length++] = c;
}
builder.add(scratch, output);
}
return builder.finish();
}
/**
@ -231,7 +289,7 @@ public class Dictionary {
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(CharArrayMap<List<Character>> affixes,
private void parseAffix(TreeMap<String,List<Character>> affixes,
String header,
LineNumberReader reader,
String conditionPattern,

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
/**
@ -125,12 +126,13 @@ final class Stemmer {
List<CharsRef> stems = new ArrayList<CharsRef>();
for (int i = 0; i < length; i++) {
List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
continue;
}
for (Character suffix : suffixes) {
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
affixReader.setPosition(8 * suffix);
char flag = (char) (affixReader.readShort() & 0xffff);
if (hasCrossCheckedFlag(flag, flags)) {
@ -149,12 +151,13 @@ final class Stemmer {
}
for (int i = length - 1; i >= 0; i--) {
List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
continue;
}
for (Character prefix : prefixes) {
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
affixReader.setPosition(8 * prefix);
char flag = (char) (affixReader.readShort() & 0xffff);
if (hasCrossCheckedFlag(flag, flags)) {
@ -185,7 +188,7 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);

View File

@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase {
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
"suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
}

View File

@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase {
InputStream dictStream = getClass().getResourceAsStream("simple.dic");
Dictionary dictionary = new Dictionary(affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
assertNotNull(flags);
assertEquals(1, flags.length);
@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase {
InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
Dictionary dictionary = new Dictionary(affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
affixStream.close();