LUCENE-5468: encode affix data as 8 bytes per affix, before cutting over to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-02-27 17:19:15 +00:00
parent 9896e610d3
commit cdec14902b
3 changed files with 71 additions and 149 deletions

View File

@ -1,113 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Wrapper class representing a hunspell affix
*/
final class Affix {
private int appendFlags; // continuation class flags
private int condition; // check condition
private boolean crossProduct;
private char flag;
private int strip;
/**
* Returns the flags defined for the affix append
*
* @return Flags defined for the affix append
*/
public int getAppendFlags() {
return appendFlags;
}
/**
* Sets the flags defined for the affix append
*
* @param appendFlags Flags defined for the affix append
*/
public void setAppendFlags(int appendFlags) {
this.appendFlags = appendFlags;
}
/**
* Returns the stripping characters defined for the affix
*
* @return Stripping characters defined for the affix
*/
public int getStrip() {
return strip;
}
/**
* Sets the stripping characters defined for the affix
*
* @param strip Stripping characters defined for the affix
*/
public void setStrip(int strip) {
this.strip = strip;
}
/**
* Sets the condition that must be met before the affix can be applied
*/
public void setCondition(int condition) {
this.condition = condition;
}
public int getCondition() {
return condition;
}
/**
* Returns the affix flag
*
* @return Affix flag
*/
public char getFlag() {
return flag;
}
/**
* Sets the affix flag
*
* @param flag Affix flag
*/
public void setFlag(char flag) {
this.flag = flag;
}
/**
* Returns whether the affix is defined as cross product
*
* @return {@code true} if the affix is cross product, {@code false} otherwise
*/
public boolean isCrossProduct() {
return crossProduct;
}
/**
* Sets whether the affix is defined as cross product
*
* @param crossProduct Whether the affix is defined as cross product
*/
public void setCrossProduct(boolean crossProduct) {
this.crossProduct = crossProduct;
}
}

View File

@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2;
*/
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.IOUtils;
@ -66,8 +68,8 @@ public class Dictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
public CharArrayMap<List<Affix>> prefixes;
public CharArrayMap<List<Affix>> suffixes;
public CharArrayMap<List<Character>> prefixes;
public CharArrayMap<List<Character>> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
@ -85,6 +87,10 @@ public class Dictionary {
// the list of unique strip affixes.
public BytesRefHash stripLookup = new BytesRefHash();
// 8 bytes per affix
public byte[] affixData = new byte[64];
private int currentAffix = 0;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
private String[] aliases;
@ -169,7 +175,7 @@ public class Dictionary {
* @param length Length from the offset that the String is
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
*/
public List<Affix> lookupPrefix(char word[], int offset, int length) {
public List<Character> lookupPrefix(char word[], int offset, int length) {
return prefixes.get(word, offset, length);
}
@ -181,7 +187,7 @@ public class Dictionary {
* @param length Length from the offset that the String is
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
*/
List<Affix> lookupSuffix(char word[], int offset, int length) {
List<Character> lookupSuffix(char word[], int offset, int length) {
return suffixes.get(word, offset, length);
}
@ -193,8 +199,8 @@ public class Dictionary {
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
Map<String,Integer> seenPatterns = new HashMap<>();
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@ -225,7 +231,7 @@ public class Dictionary {
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(CharArrayMap<List<Affix>> affixes,
private void parseAffix(CharArrayMap<List<Character>> affixes,
String header,
LineNumberReader reader,
String conditionPattern,
@ -237,7 +243,14 @@ public class Dictionary {
boolean crossProduct = args[2].equals("Y");
int numLines = Integer.parseInt(args[3]);
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
for (int i = 0; i < numLines; i++) {
if (currentAffix > Short.MAX_VALUE) {
throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
}
assert affixWriter.getPosition() == currentAffix << 3;
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
@ -245,7 +258,6 @@ public class Dictionary {
throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
}
char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
String affixArg = ruleArgs[3];
@ -285,36 +297,42 @@ public class Dictionary {
patterns.add(pattern);
}
Affix affix = new Affix();
scratch.copyChars(strip);
int ord = stripLookup.add(scratch);
if (ord < 0) {
int stripOrd = stripLookup.add(scratch);
if (stripOrd < 0) {
// already exists in our hash
ord = (-ord)-1;
stripOrd = (-stripOrd)-1;
}
affix.setStrip(ord);
affix.setFlag(flag);
affix.setCondition(patternIndex);
affix.setCrossProduct(crossProduct);
if (appendFlags == null) {
appendFlags = NOFLAGS;
}
final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
ord = flagLookup.add(scratch, hashCode);
if (ord < 0) {
int appendFlagsOrd = flagLookup.add(scratch, hashCode);
if (appendFlagsOrd < 0) {
// already exists in our hash
ord = (-ord)-1;
appendFlagsOrd = (-appendFlagsOrd)-1;
} else if (appendFlagsOrd > Short.MAX_VALUE) {
// this limit is probably flexible, but its a good sanity check too
throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
}
affix.setAppendFlags(ord);
List<Affix> list = affixes.get(affixArg);
affixWriter.writeShort((short)flag);
affixWriter.writeShort((short)stripOrd);
// encode crossProduct into patternIndex
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short)patternOrd);
affixWriter.writeShort((short)appendFlagsOrd);
List<Character> list = affixes.get(affixArg);
if (list == null) {
list = new ArrayList<Affix>();
list = new ArrayList<Character>();
affixes.put(affixArg, list);
}
list.add(affix);
list.add((char)currentAffix);
currentAffix++;
}
}

View File

@ -24,6 +24,7 @@ import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
@ -37,6 +38,7 @@ final class Stemmer {
private final Dictionary dictionary;
private BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
private final ByteArrayDataInput affixReader;
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
@ -56,6 +58,7 @@ final class Stemmer {
*/
public Stemmer(Dictionary dictionary, int recursionCap) {
this.dictionary = dictionary;
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
this.recursionCap = recursionCap;
}
@ -122,17 +125,20 @@ final class Stemmer {
List<CharsRef> stems = new ArrayList<CharsRef>();
for (int i = 0; i < length; i++) {
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
continue;
}
for (Affix suffix : suffixes) {
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
for (Character suffix : suffixes) {
affixReader.setPosition(8 * suffix);
char flag = (char) (affixReader.readShort() & 0xffff);
if (hasCrossCheckedFlag(flag, flags)) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
// TODO: can we do this in-place?
dictionary.stripLookup.get(suffix.getStrip(), scratch);
char stripOrd = (char) (affixReader.readShort() & 0xffff);
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
@ -143,17 +149,20 @@ final class Stemmer {
}
for (int i = length - 1; i >= 0; i--) {
List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
continue;
}
for (Affix prefix : prefixes) {
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
for (Character prefix : prefixes) {
affixReader.setPosition(8 * prefix);
char flag = (char) (affixReader.readShort() & 0xffff);
if (hasCrossCheckedFlag(flag, flags)) {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
char stripOrd = (char) (affixReader.readShort() & 0xffff);
dictionary.stripLookup.get(prefix.getStrip(), scratch);
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(scratch.utf8ToString())
.append(word, deAffixedStart, deAffixedLength)
.toString();
@ -176,11 +185,19 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
Pattern pattern = dictionary.patterns.get(affix.getCondition());
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
affixReader.skipBytes(2); // strip
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
Pattern pattern = dictionary.patterns.get(condition);
if (!pattern.matcher(segment).matches()) {
return Collections.emptyList();
}
@ -188,12 +205,12 @@ final class Stemmer {
List<CharsRef> stems = new ArrayList<CharsRef>();
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
stems.add(new CharsRef(strippedWord, 0, length));
}
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
if (crossProduct && recursionDepth < recursionCap) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
}