mirror of https://github.com/apache/lucene.git
LUCENE-5468: convert affixes to FST
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cdec14902b
commit
b2b86dd8ad
|
@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil;
|
|||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -46,6 +48,7 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
|
@ -68,8 +71,8 @@ public class Dictionary {
|
|||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||
|
||||
public CharArrayMap<List<Character>> prefixes;
|
||||
public CharArrayMap<List<Character>> suffixes;
|
||||
public FST<IntsRef> prefixes;
|
||||
public FST<IntsRef> suffixes;
|
||||
|
||||
// all Patterns used by prefixes and suffixes. these are typically re-used across
|
||||
// many affix stripping rules. so these are deduplicated, to save RAM.
|
||||
|
@ -137,7 +140,7 @@ public class Dictionary {
|
|||
ord = lookupOrd(word, offset, length);
|
||||
} catch (IOException ex) { /* bogus */ }
|
||||
if (ord == null) {
|
||||
return null;
|
||||
return null;
|
||||
}
|
||||
return decodeFlags(flagLookup.get(ord, scratch));
|
||||
}
|
||||
|
@ -175,8 +178,8 @@ public class Dictionary {
|
|||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
public List<Character> lookupPrefix(char word[], int offset, int length) {
|
||||
return prefixes.get(word, offset, length);
|
||||
IntsRef lookupPrefix(char word[], int offset, int length) {
|
||||
return lookupAffix(prefixes, word, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -187,8 +190,42 @@ public class Dictionary {
|
|||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
List<Character> lookupSuffix(char word[], int offset, int length) {
|
||||
return suffixes.get(word, offset, length);
|
||||
IntsRef lookupSuffix(char word[], int offset, int length) {
|
||||
return lookupAffix(suffixes, word, offset, length);
|
||||
}
|
||||
|
||||
// TODO: this is pretty stupid, considering how the stemming algorithm works
|
||||
// we can speed it up to be significantly faster!
|
||||
IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
|
||||
if (fst == null) {
|
||||
return null;
|
||||
}
|
||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
||||
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
|
||||
// Accumulate output as we go
|
||||
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
|
||||
int l = offset + length;
|
||||
try {
|
||||
for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
|
||||
cp = Character.codePointAt(word, i, l);
|
||||
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
output = fst.outputs.add(output, arc.output);
|
||||
}
|
||||
}
|
||||
if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
|
||||
return null;
|
||||
} else if (arc.output != NO_OUTPUT) {
|
||||
return fst.outputs.add(output, arc.output);
|
||||
} else {
|
||||
return output;
|
||||
}
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -199,8 +236,8 @@ public class Dictionary {
|
|||
* @throws IOException Can be thrown while reading from the InputStream
|
||||
*/
|
||||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
|
||||
prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
|
||||
suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
|
||||
TreeMap<String, List<Character>> prefixes = new TreeMap<>();
|
||||
TreeMap<String, List<Character>> suffixes = new TreeMap<>();
|
||||
Map<String,Integer> seenPatterns = new HashMap<>();
|
||||
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||
|
@ -218,6 +255,27 @@ public class Dictionary {
|
|||
flagParsingStrategy = getFlagParsingStrategy(line);
|
||||
}
|
||||
}
|
||||
|
||||
this.prefixes = affixFST(prefixes);
|
||||
this.suffixes = affixFST(suffixes);
|
||||
}
|
||||
|
||||
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
|
||||
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
||||
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||
|
||||
IntsRef scratch = new IntsRef();
|
||||
for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
|
||||
Util.toUTF32(entry.getKey(), scratch);
|
||||
List<Character> entries = entry.getValue();
|
||||
IntsRef output = new IntsRef(entries.size());
|
||||
int upto = 0;
|
||||
for (Character c : entries) {
|
||||
output.ints[output.length++] = c;
|
||||
}
|
||||
builder.add(scratch, output);
|
||||
}
|
||||
return builder.finish();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -231,7 +289,7 @@ public class Dictionary {
|
|||
* @param seenPatterns map from condition -> index of patterns, for deduplication.
|
||||
* @throws IOException Can be thrown while reading the rule
|
||||
*/
|
||||
private void parseAffix(CharArrayMap<List<Character>> affixes,
|
||||
private void parseAffix(TreeMap<String,List<Character>> affixes,
|
||||
String header,
|
||||
LineNumberReader reader,
|
||||
String conditionPattern,
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -125,12 +126,13 @@ final class Stemmer {
|
|||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
if (suffixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Character suffix : suffixes) {
|
||||
for (int j = 0; j < suffixes.length; j++) {
|
||||
int suffix = suffixes.ints[suffixes.offset + j];
|
||||
affixReader.setPosition(8 * suffix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
if (hasCrossCheckedFlag(flag, flags)) {
|
||||
|
@ -149,12 +151,13 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
for (int i = length - 1; i >= 0; i--) {
|
||||
List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
if (prefixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Character prefix : prefixes) {
|
||||
for (int j = 0; j < prefixes.length; j++) {
|
||||
int prefix = prefixes.ints[prefixes.offset + j];
|
||||
affixReader.setPosition(8 * prefix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
if (hasCrossCheckedFlag(flag, flags)) {
|
||||
|
@ -185,7 +188,7 @@ final class Stemmer {
|
|||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
|
||||
public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
|
||||
segment.setLength(0);
|
||||
segment.append(strippedWord, 0, length);
|
||||
|
||||
|
|
|
@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
|
||||
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
|
||||
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
|
||||
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
|
||||
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
|
||||
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
|
||||
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
|
||||
"suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
|
||||
}
|
||||
|
|
|
@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase {
|
|||
InputStream dictStream = getClass().getResourceAsStream("simple.dic");
|
||||
|
||||
Dictionary dictionary = new Dictionary(affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
|
||||
char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
|
||||
assertNotNull(flags);
|
||||
assertEquals(1, flags.length);
|
||||
|
@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase {
|
|||
InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
|
||||
|
||||
Dictionary dictionary = new Dictionary(affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
|
||||
|
||||
affixStream.close();
|
||||
|
|
Loading…
Reference in New Issue