From cdec14902bc86e7826c3194199dceaa40991c153 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 17:19:15 +0000 Subject: [PATCH] LUCENE-5468: encode affix data as 8 bytes per affix, before cutting over to FST git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 113 ------------------ .../lucene/analysis/hunspell2/Dictionary.java | 64 ++++++---- .../lucene/analysis/hunspell2/Stemmer.java | 43 +++++-- 3 files changed, 71 insertions(+), 149 deletions(-) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java deleted file mode 100644 index eb67f60e763..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Wrapper class representing a hunspell affix - */ -final class Affix { - - private int appendFlags; // continuation class flags - private int condition; // check condition - private boolean crossProduct; - private char flag; - private int strip; - - /** - * Returns the flags defined for the affix append - * - * @return Flags defined for the affix append - */ - public int getAppendFlags() { - return appendFlags; - } - - /** - * Sets the flags defined for the affix append - * - * @param appendFlags Flags defined for the affix append - */ - public void setAppendFlags(int appendFlags) { - this.appendFlags = appendFlags; - } - - /** - * Returns the stripping characters defined for the affix - * - * @return Stripping characters defined for the affix - */ - public int getStrip() { - return strip; - } - - /** - * Sets the stripping characters defined for the affix - * - * @param strip Stripping characters defined for the affix - */ - public void setStrip(int strip) { - this.strip = strip; - } - - /** - * Sets the condition that must be met before the affix can be applied - */ - public void setCondition(int condition) { - this.condition = condition; - } - - public int getCondition() { - return condition; - } - - /** - * Returns the affix flag - * - * @return Affix flag - */ - public char getFlag() { - return flag; - } - - /** - * Sets the affix flag - * - * @param flag Affix flag - */ - public void setFlag(char flag) { - this.flag = flag; - } - - /** - * Returns whether the affix is defined as cross product - * - * @return {@code true} if the affix is cross product, {@code false} otherwise - */ - public boolean isCrossProduct() { - return crossProduct; - } - - /** - * Sets whether the affix is defined as cross product - * - * @param crossProduct Whether the affix is defined as cross product - */ - public void setCrossProduct(boolean crossProduct) { - this.crossProduct = crossProduct; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 35c7aee6081..b30bdaa1e92 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2; */ import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.IOUtils; @@ -66,8 +68,8 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public CharArrayMap> prefixes; - public CharArrayMap> suffixes; + public CharArrayMap> prefixes; + public CharArrayMap> suffixes; // all Patterns used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. @@ -84,6 +86,10 @@ public class Dictionary { // the list of unique strip affixes. public BytesRefHash stripLookup = new BytesRefHash(); + + // 8 bytes per affix + public byte[] affixData = new byte[64]; + private int currentAffix = 0; private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy @@ -169,7 +175,7 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found */ - public List lookupPrefix(char word[], int offset, int length) { + public List lookupPrefix(char word[], int offset, int length) { return prefixes.get(word, offset, length); } @@ -181,7 +187,7 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found */ - List lookupSuffix(char word[], int offset, int length) { + List lookupSuffix(char word[], int offset, int length) { return suffixes.get(word, offset, length); } @@ -193,8 +199,8 @@ public class Dictionary { * @throws IOException Can be thrown while reading from the InputStream */ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { - prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); - suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); Map seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); @@ -225,7 +231,7 @@ public class Dictionary { * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ - private void parseAffix(CharArrayMap> affixes, + private void parseAffix(CharArrayMap> affixes, String header, LineNumberReader reader, String conditionPattern, @@ -237,14 +243,20 @@ public class Dictionary { boolean crossProduct = args[2].equals("Y"); int numLines = Integer.parseInt(args[3]); + affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); + ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); + for (int i = 0; i < numLines; i++) { + if (currentAffix > Short.MAX_VALUE) { + throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org"); + } + assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); if (ruleArgs.length < 5) { throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); } - char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; @@ -285,36 +297,42 @@ public class Dictionary { patterns.add(pattern); } - Affix affix = new Affix(); scratch.copyChars(strip); - int ord = stripLookup.add(scratch); - if (ord < 0) { + int stripOrd = stripLookup.add(scratch); + if (stripOrd < 0) { // already exists in our hash - ord = (-ord)-1; + stripOrd = (-stripOrd)-1; } - affix.setStrip(ord); - affix.setFlag(flag); - affix.setCondition(patternIndex); - affix.setCrossProduct(crossProduct); + if (appendFlags == null) { appendFlags = NOFLAGS; } final int hashCode = encodeFlagsWithHash(scratch, appendFlags); - ord = flagLookup.add(scratch, hashCode); - if (ord < 0) { + int appendFlagsOrd = flagLookup.add(scratch, hashCode); + if (appendFlagsOrd < 0) { // already exists in our hash - ord = (-ord)-1; + appendFlagsOrd = (-appendFlagsOrd)-1; + } else if (appendFlagsOrd > Short.MAX_VALUE) { + // this limit is probably flexible, but its a good sanity check too + throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org"); } - affix.setAppendFlags(ord); - List list = affixes.get(affixArg); + affixWriter.writeShort((short)flag); + affixWriter.writeShort((short)stripOrd); + // encode crossProduct into patternIndex + int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); + affixWriter.writeShort((short)patternOrd); + affixWriter.writeShort((short)appendFlagsOrd); + + List list = affixes.get(affixArg); if (list == null) { - list = new ArrayList(); + list = new ArrayList(); affixes.put(affixArg, list); } - list.add(affix); + list.add((char)currentAffix); + currentAffix++; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 54dce381b1a..4eaff6a9e95 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Version; @@ -37,6 +38,7 @@ final class Stemmer { private final Dictionary dictionary; private BytesRef scratch = new BytesRef(); private final StringBuilder segment = new StringBuilder(); + private final ByteArrayDataInput affixReader; /** * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the @@ -56,6 +58,7 @@ final class Stemmer { */ public Stemmer(Dictionary dictionary, int recursionCap) { this.dictionary = dictionary; + this.affixReader = new ByteArrayDataInput(dictionary.affixData); this.recursionCap = recursionCap; } @@ -122,17 +125,20 @@ final class Stemmer { List stems = new ArrayList(); for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); + List suffixes = dictionary.lookupSuffix(word, i, length - i); if (suffixes == null) { continue; } - for (Affix suffix : suffixes) { - if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { + for (Character suffix : suffixes) { + affixReader.setPosition(8 * suffix); + char flag = (char) (affixReader.readShort() & 0xffff); + if (hasCrossCheckedFlag(flag, flags)) { int appendLength = length - i; int deAffixedLength = length - appendLength; // TODO: can we do this in-place? - dictionary.stripLookup.get(suffix.getStrip(), scratch); + char stripOrd = (char) (affixReader.readShort() & 0xffff); + dictionary.stripLookup.get(stripOrd, scratch); String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString(); List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); @@ -143,17 +149,20 @@ final class Stemmer { } for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); + List prefixes = dictionary.lookupPrefix(word, 0, i); if (prefixes == null) { continue; } - for (Affix prefix : prefixes) { - if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { + for (Character prefix : prefixes) { + affixReader.setPosition(8 * prefix); + char flag = (char) (affixReader.readShort() & 0xffff); + if (hasCrossCheckedFlag(flag, flags)) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; + char stripOrd = (char) (affixReader.readShort() & 0xffff); - dictionary.stripLookup.get(prefix.getStrip(), scratch); + dictionary.stripLookup.get(stripOrd, scratch); String strippedWord = new StringBuilder().append(scratch.utf8ToString()) .append(word, deAffixedStart, deAffixedLength) .toString(); @@ -176,11 +185,19 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { + public List applyAffix(char strippedWord[], int length, char affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); - Pattern pattern = dictionary.patterns.get(affix.getCondition()); + affixReader.setPosition(8 * affix); + char flag = (char) (affixReader.readShort() & 0xffff); + affixReader.skipBytes(2); // strip + int condition = (char) (affixReader.readShort() & 0xffff); + boolean crossProduct = (condition & 1) == 1; + condition >>>= 1; + char append = (char) (affixReader.readShort() & 0xffff); + + Pattern pattern = dictionary.patterns.get(condition); if (!pattern.matcher(segment).matches()) { return Collections.emptyList(); } @@ -188,12 +205,12 @@ final class Stemmer { List stems = new ArrayList(); char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); - if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { + if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) { stems.add(new CharsRef(strippedWord, 0, length)); } - if (affix.isCrossProduct() && recursionDepth < recursionCap) { - dictionary.flagLookup.get(affix.getAppendFlags(), scratch); + if (crossProduct && recursionDepth < recursionCap) { + dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth)); }