mirror of https://github.com/apache/lucene.git
LUCENE-5468: encode affix data as 8 bytes per affix, before cutting over to FST
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9896e610d3
commit
cdec14902b
|
@ -1,113 +0,0 @@
|
|||
package org.apache.lucene.analysis.hunspell2;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Wrapper class representing a hunspell affix
|
||||
*/
|
||||
final class Affix {
|
||||
|
||||
private int appendFlags; // continuation class flags
|
||||
private int condition; // check condition
|
||||
private boolean crossProduct;
|
||||
private char flag;
|
||||
private int strip;
|
||||
|
||||
/**
|
||||
* Returns the flags defined for the affix append
|
||||
*
|
||||
* @return Flags defined for the affix append
|
||||
*/
|
||||
public int getAppendFlags() {
|
||||
return appendFlags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the flags defined for the affix append
|
||||
*
|
||||
* @param appendFlags Flags defined for the affix append
|
||||
*/
|
||||
public void setAppendFlags(int appendFlags) {
|
||||
this.appendFlags = appendFlags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the stripping characters defined for the affix
|
||||
*
|
||||
* @return Stripping characters defined for the affix
|
||||
*/
|
||||
public int getStrip() {
|
||||
return strip;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the stripping characters defined for the affix
|
||||
*
|
||||
* @param strip Stripping characters defined for the affix
|
||||
*/
|
||||
public void setStrip(int strip) {
|
||||
this.strip = strip;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the condition that must be met before the affix can be applied
|
||||
*/
|
||||
public void setCondition(int condition) {
|
||||
this.condition = condition;
|
||||
}
|
||||
|
||||
public int getCondition() {
|
||||
return condition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the affix flag
|
||||
*
|
||||
* @return Affix flag
|
||||
*/
|
||||
public char getFlag() {
|
||||
return flag;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the affix flag
|
||||
*
|
||||
* @param flag Affix flag
|
||||
*/
|
||||
public void setFlag(char flag) {
|
||||
this.flag = flag;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the affix is defined as cross product
|
||||
*
|
||||
* @return {@code true} if the affix is cross product, {@code false} otherwise
|
||||
*/
|
||||
public boolean isCrossProduct() {
|
||||
return crossProduct;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the affix is defined as cross product
|
||||
*
|
||||
* @param crossProduct Whether the affix is defined as cross product
|
||||
*/
|
||||
public void setCrossProduct(boolean crossProduct) {
|
||||
this.crossProduct = crossProduct;
|
||||
}
|
||||
}
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -66,8 +68,8 @@ public class Dictionary {
|
|||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||
|
||||
public CharArrayMap<List<Affix>> prefixes;
|
||||
public CharArrayMap<List<Affix>> suffixes;
|
||||
public CharArrayMap<List<Character>> prefixes;
|
||||
public CharArrayMap<List<Character>> suffixes;
|
||||
|
||||
// all Patterns used by prefixes and suffixes. these are typically re-used across
|
||||
// many affix stripping rules. so these are deduplicated, to save RAM.
|
||||
|
@ -85,6 +87,10 @@ public class Dictionary {
|
|||
// the list of unique strip affixes.
|
||||
public BytesRefHash stripLookup = new BytesRefHash();
|
||||
|
||||
// 8 bytes per affix
|
||||
public byte[] affixData = new byte[64];
|
||||
private int currentAffix = 0;
|
||||
|
||||
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
|
||||
|
||||
private String[] aliases;
|
||||
|
@ -169,7 +175,7 @@ public class Dictionary {
|
|||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
public List<Affix> lookupPrefix(char word[], int offset, int length) {
|
||||
public List<Character> lookupPrefix(char word[], int offset, int length) {
|
||||
return prefixes.get(word, offset, length);
|
||||
}
|
||||
|
||||
|
@ -181,7 +187,7 @@ public class Dictionary {
|
|||
* @param length Length from the offset that the String is
|
||||
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
|
||||
*/
|
||||
List<Affix> lookupSuffix(char word[], int offset, int length) {
|
||||
List<Character> lookupSuffix(char word[], int offset, int length) {
|
||||
return suffixes.get(word, offset, length);
|
||||
}
|
||||
|
||||
|
@ -193,8 +199,8 @@ public class Dictionary {
|
|||
* @throws IOException Can be thrown while reading from the InputStream
|
||||
*/
|
||||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
|
||||
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||
prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
|
||||
suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
|
||||
Map<String,Integer> seenPatterns = new HashMap<>();
|
||||
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||
|
@ -225,7 +231,7 @@ public class Dictionary {
|
|||
* @param seenPatterns map from condition -> index of patterns, for deduplication.
|
||||
* @throws IOException Can be thrown while reading the rule
|
||||
*/
|
||||
private void parseAffix(CharArrayMap<List<Affix>> affixes,
|
||||
private void parseAffix(CharArrayMap<List<Character>> affixes,
|
||||
String header,
|
||||
LineNumberReader reader,
|
||||
String conditionPattern,
|
||||
|
@ -237,7 +243,14 @@ public class Dictionary {
|
|||
boolean crossProduct = args[2].equals("Y");
|
||||
|
||||
int numLines = Integer.parseInt(args[3]);
|
||||
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
|
||||
ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
|
||||
|
||||
for (int i = 0; i < numLines; i++) {
|
||||
if (currentAffix > Short.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
|
||||
}
|
||||
assert affixWriter.getPosition() == currentAffix << 3;
|
||||
String line = reader.readLine();
|
||||
String ruleArgs[] = line.split("\\s+");
|
||||
|
||||
|
@ -245,7 +258,6 @@ public class Dictionary {
|
|||
throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
|
||||
}
|
||||
|
||||
|
||||
char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
|
||||
String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
|
||||
String affixArg = ruleArgs[3];
|
||||
|
@ -285,36 +297,42 @@ public class Dictionary {
|
|||
patterns.add(pattern);
|
||||
}
|
||||
|
||||
Affix affix = new Affix();
|
||||
scratch.copyChars(strip);
|
||||
int ord = stripLookup.add(scratch);
|
||||
if (ord < 0) {
|
||||
int stripOrd = stripLookup.add(scratch);
|
||||
if (stripOrd < 0) {
|
||||
// already exists in our hash
|
||||
ord = (-ord)-1;
|
||||
stripOrd = (-stripOrd)-1;
|
||||
}
|
||||
affix.setStrip(ord);
|
||||
affix.setFlag(flag);
|
||||
affix.setCondition(patternIndex);
|
||||
affix.setCrossProduct(crossProduct);
|
||||
|
||||
if (appendFlags == null) {
|
||||
appendFlags = NOFLAGS;
|
||||
}
|
||||
|
||||
final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
|
||||
ord = flagLookup.add(scratch, hashCode);
|
||||
if (ord < 0) {
|
||||
int appendFlagsOrd = flagLookup.add(scratch, hashCode);
|
||||
if (appendFlagsOrd < 0) {
|
||||
// already exists in our hash
|
||||
ord = (-ord)-1;
|
||||
appendFlagsOrd = (-appendFlagsOrd)-1;
|
||||
} else if (appendFlagsOrd > Short.MAX_VALUE) {
|
||||
// this limit is probably flexible, but its a good sanity check too
|
||||
throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
|
||||
}
|
||||
affix.setAppendFlags(ord);
|
||||
|
||||
List<Affix> list = affixes.get(affixArg);
|
||||
affixWriter.writeShort((short)flag);
|
||||
affixWriter.writeShort((short)stripOrd);
|
||||
// encode crossProduct into patternIndex
|
||||
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
|
||||
affixWriter.writeShort((short)patternOrd);
|
||||
affixWriter.writeShort((short)appendFlagsOrd);
|
||||
|
||||
List<Character> list = affixes.get(affixArg);
|
||||
if (list == null) {
|
||||
list = new ArrayList<Affix>();
|
||||
list = new ArrayList<Character>();
|
||||
affixes.put(affixArg, list);
|
||||
}
|
||||
|
||||
list.add(affix);
|
||||
list.add((char)currentAffix);
|
||||
currentAffix++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.List;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -37,6 +38,7 @@ final class Stemmer {
|
|||
private final Dictionary dictionary;
|
||||
private BytesRef scratch = new BytesRef();
|
||||
private final StringBuilder segment = new StringBuilder();
|
||||
private final ByteArrayDataInput affixReader;
|
||||
|
||||
/**
|
||||
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
|
||||
|
@ -56,6 +58,7 @@ final class Stemmer {
|
|||
*/
|
||||
public Stemmer(Dictionary dictionary, int recursionCap) {
|
||||
this.dictionary = dictionary;
|
||||
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
|
||||
this.recursionCap = recursionCap;
|
||||
}
|
||||
|
||||
|
@ -122,17 +125,20 @@ final class Stemmer {
|
|||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
if (suffixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Affix suffix : suffixes) {
|
||||
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
|
||||
for (Character suffix : suffixes) {
|
||||
affixReader.setPosition(8 * suffix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
if (hasCrossCheckedFlag(flag, flags)) {
|
||||
int appendLength = length - i;
|
||||
int deAffixedLength = length - appendLength;
|
||||
// TODO: can we do this in-place?
|
||||
dictionary.stripLookup.get(suffix.getStrip(), scratch);
|
||||
char stripOrd = (char) (affixReader.readShort() & 0xffff);
|
||||
dictionary.stripLookup.get(stripOrd, scratch);
|
||||
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
|
||||
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
|
||||
|
@ -143,17 +149,20 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
for (int i = length - 1; i >= 0; i--) {
|
||||
List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
if (prefixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Affix prefix : prefixes) {
|
||||
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
|
||||
for (Character prefix : prefixes) {
|
||||
affixReader.setPosition(8 * prefix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
if (hasCrossCheckedFlag(flag, flags)) {
|
||||
int deAffixedStart = i;
|
||||
int deAffixedLength = length - deAffixedStart;
|
||||
char stripOrd = (char) (affixReader.readShort() & 0xffff);
|
||||
|
||||
dictionary.stripLookup.get(prefix.getStrip(), scratch);
|
||||
dictionary.stripLookup.get(stripOrd, scratch);
|
||||
String strippedWord = new StringBuilder().append(scratch.utf8ToString())
|
||||
.append(word, deAffixedStart, deAffixedLength)
|
||||
.toString();
|
||||
|
@ -176,11 +185,19 @@ final class Stemmer {
|
|||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
|
||||
public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
|
||||
segment.setLength(0);
|
||||
segment.append(strippedWord, 0, length);
|
||||
|
||||
Pattern pattern = dictionary.patterns.get(affix.getCondition());
|
||||
affixReader.setPosition(8 * affix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
affixReader.skipBytes(2); // strip
|
||||
int condition = (char) (affixReader.readShort() & 0xffff);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
char append = (char) (affixReader.readShort() & 0xffff);
|
||||
|
||||
Pattern pattern = dictionary.patterns.get(condition);
|
||||
if (!pattern.matcher(segment).matches()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
@ -188,12 +205,12 @@ final class Stemmer {
|
|||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
||||
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
|
||||
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
|
||||
if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
|
||||
stems.add(new CharsRef(strippedWord, 0, length));
|
||||
}
|
||||
|
||||
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
||||
dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
|
||||
if (crossProduct && recursionDepth < recursionCap) {
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue