From b2b86dd8add14c8e16d9b794707a3948834a6e68 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 17:53:30 +0000 Subject: [PATCH] LUCENE-5468: convert affixes to FST git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Dictionary.java | 78 ++++++++++++++++--- .../lucene/analysis/hunspell2/Stemmer.java | 13 ++-- .../hunspell2/TestAllDictionaries.java | 3 + .../analysis/hunspell2/TestDictionary.java | 8 +- 4 files changed, 83 insertions(+), 19 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index b30bdaa1e92..b9f9c82c2f5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.Version; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; import java.io.*; import java.nio.charset.Charset; @@ -46,6 +48,7 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.TreeMap; import java.util.regex.Pattern; /** @@ -68,8 +71,8 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public CharArrayMap> prefixes; - public CharArrayMap> suffixes; + public FST prefixes; + public FST suffixes; // all Patterns used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. @@ -137,7 +140,7 @@ public class Dictionary { ord = lookupOrd(word, offset, length); } catch (IOException ex) { /* bogus */ } if (ord == null) { - return null; + return null; } return decodeFlags(flagLookup.get(ord, scratch)); } @@ -175,8 +178,8 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found */ - public List lookupPrefix(char word[], int offset, int length) { - return prefixes.get(word, offset, length); + IntsRef lookupPrefix(char word[], int offset, int length) { + return lookupAffix(prefixes, word, offset, length); } /** @@ -187,8 +190,42 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found */ - List lookupSuffix(char word[], int offset, int length) { - return suffixes.get(word, offset, length); + IntsRef lookupSuffix(char word[], int offset, int length) { + return lookupAffix(suffixes, word, offset, length); + } + + // TODO: this is pretty stupid, considering how the stemming algorithm works + // we can speed it up to be significantly faster! + IntsRef lookupAffix(FST fst, char word[], int offset, int length) { + if (fst == null) { + return null; + } + final FST.BytesReader bytesReader = fst.getBytesReader(); + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + // Accumulate output as we go + final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); + IntsRef output = NO_OUTPUT; + + int l = offset + length; + try { + for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { + cp = Character.codePointAt(word, i, l); + if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, arc.output); + } else { + return output; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } } /** @@ -199,8 +236,8 @@ public class Dictionary { * @throws IOException Can be thrown while reading from the InputStream */ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { - prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); - suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + TreeMap> prefixes = new TreeMap<>(); + TreeMap> suffixes = new TreeMap<>(); Map seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); @@ -218,6 +255,27 @@ public class Dictionary { flagParsingStrategy = getFlagParsingStrategy(line); } } + + this.prefixes = affixFST(prefixes); + this.suffixes = affixFST(suffixes); + } + + private FST affixFST(TreeMap> affixes) throws IOException { + IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); + Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); + + IntsRef scratch = new IntsRef(); + for (Map.Entry> entry : affixes.entrySet()) { + Util.toUTF32(entry.getKey(), scratch); + List entries = entry.getValue(); + IntsRef output = new IntsRef(entries.size()); + int upto = 0; + for (Character c : entries) { + output.ints[output.length++] = c; + } + builder.add(scratch, output); + } + return builder.finish(); } /** @@ -231,7 +289,7 @@ public class Dictionary { * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ - private void parseAffix(CharArrayMap> affixes, + private void parseAffix(TreeMap> affixes, String header, LineNumberReader reader, String conditionPattern, diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 4eaff6a9e95..d6b0133830a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.Version; /** @@ -125,12 +126,13 @@ final class Stemmer { List stems = new ArrayList(); for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); + IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i); if (suffixes == null) { continue; } - for (Character suffix : suffixes) { + for (int j = 0; j < suffixes.length; j++) { + int suffix = suffixes.ints[suffixes.offset + j]; affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); if (hasCrossCheckedFlag(flag, flags)) { @@ -149,12 +151,13 @@ final class Stemmer { } for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); + IntsRef prefixes = dictionary.lookupPrefix(word, 0, i); if (prefixes == null) { continue; } - for (Character prefix : prefixes) { + for (int j = 0; j < prefixes.length; j++) { + int prefix = prefixes.ints[prefixes.offset + j]; affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); if (hasCrossCheckedFlag(flag, flags)) { @@ -185,7 +188,7 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, char affix, int recursionDepth) { + public List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java index 9f9bce98236..d00fc634944 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase { System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + + "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " + + "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " + + "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " + "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " + "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java index 14c6e8967d0..e8e0fd0d030 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java @@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase { InputStream dictStream = getClass().getResourceAsStream("simple.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()); assertNotNull(flags); assertEquals(1, flags.length); @@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase { InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length); affixStream.close();