mirror of https://github.com/apache/lucene.git
LUCENE-9677: simplify Dictionary.affixData storage (#2218)
Use char[] instead of byte[], get rid of unnecessary byte array readers/writers.
This commit is contained in:
parent
ab08fdc6f0
commit
426c902bc9
|
@ -45,7 +45,6 @@ import java.util.TreeMap;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
@ -118,10 +117,16 @@ public class Dictionary {
|
|||
char[] stripData;
|
||||
int[] stripOffsets;
|
||||
|
||||
// 8 bytes per affix
|
||||
byte[] affixData = new byte[64];
|
||||
// 4 chars per affix, each char representing an unsigned 2-byte integer
|
||||
char[] affixData = new char[32];
|
||||
private int currentAffix = 0;
|
||||
|
||||
// offsets in affixData
|
||||
static final int AFFIX_FLAG = 0;
|
||||
static final int AFFIX_STRIP_ORD = 1;
|
||||
static final int AFFIX_CONDITION = 2;
|
||||
static final int AFFIX_APPEND = 3;
|
||||
|
||||
// Default flag parsing strategy
|
||||
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
|
||||
|
||||
|
@ -480,12 +485,9 @@ public class Dictionary {
|
|||
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX_PATTERN);
|
||||
|
||||
int numLines = Integer.parseInt(args[3]);
|
||||
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
|
||||
ByteArrayDataOutput affixWriter =
|
||||
new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
|
||||
affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4);
|
||||
|
||||
for (int i = 0; i < numLines; i++) {
|
||||
assert affixWriter.getPosition() == currentAffix << 3;
|
||||
String line = reader.readLine();
|
||||
String[] ruleArgs = line.split("\\s+");
|
||||
|
||||
|
@ -581,12 +583,13 @@ public class Dictionary {
|
|||
"Too many unique append flags, please report this to dev@lucene.apache.org");
|
||||
}
|
||||
|
||||
affixWriter.writeShort((short) flag);
|
||||
affixWriter.writeShort((short) stripOrd.intValue());
|
||||
int dataStart = currentAffix * 4;
|
||||
affixData[dataStart + AFFIX_FLAG] = flag;
|
||||
affixData[dataStart + AFFIX_STRIP_ORD] = (char) stripOrd.intValue();
|
||||
// encode crossProduct into patternIndex
|
||||
int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0);
|
||||
affixWriter.writeShort((short) patternOrd);
|
||||
affixWriter.writeShort((short) appendFlagsOrd);
|
||||
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
|
||||
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
|
||||
|
||||
if (needsInputCleaning) {
|
||||
CharSequence cleaned = cleanInput(affixArg, sb);
|
||||
|
@ -602,6 +605,10 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Returns one of the four char-sized fields stored for an affix.
 *
 * @param affixIndex index of the affix; its record starts at {@code affixIndex * 4}
 * @param offset position of the field inside the record, e.g. {@link #AFFIX_FLAG},
 *     {@link #AFFIX_STRIP_ORD}, {@link #AFFIX_CONDITION} or {@link #AFFIX_APPEND}
 * @return the raw char at that slot; no bounds or range checking is done here
 */
char affixData(int affixIndex, int offset) {
|
||||
// Records are packed back to back, 4 chars per affix.
return affixData[affixIndex * 4 + offset];
|
||||
}
|
||||
|
||||
private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
|
||||
throws IOException, ParseException {
|
||||
Map<String, String> mappings = new TreeMap<>();
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
@ -39,7 +38,6 @@ final class Stemmer {
|
|||
private final Dictionary dictionary;
|
||||
private final BytesRef scratch = new BytesRef();
|
||||
private final StringBuilder segment = new StringBuilder();
|
||||
private final ByteArrayDataInput affixReader;
|
||||
|
||||
// used for normalization
|
||||
private final StringBuilder scratchSegment = new StringBuilder();
|
||||
|
@ -56,7 +54,6 @@ final class Stemmer {
|
|||
*/
|
||||
public Stemmer(Dictionary dictionary) {
|
||||
this.dictionary = dictionary;
|
||||
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
|
||||
for (int level = 0; level < 3; level++) {
|
||||
if (dictionary.prefixes != null) {
|
||||
prefixArcs[level] = new FST.Arc<>();
|
||||
|
@ -339,13 +336,10 @@ final class Stemmer {
|
|||
if (prefix == previous) {
|
||||
continue;
|
||||
}
|
||||
affixReader.setPosition(8 * prefix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
char stripOrd = (char) (affixReader.readShort() & 0xffff);
|
||||
int condition = (char) (affixReader.readShort() & 0xffff);
|
||||
int condition = dictionary.affixData(prefix, Dictionary.AFFIX_CONDITION);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
char append = (char) (affixReader.readShort() & 0xffff);
|
||||
int append = dictionary.affixData(prefix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
final boolean compatible;
|
||||
if (recursionDepth == 0) {
|
||||
|
@ -373,6 +367,7 @@ final class Stemmer {
|
|||
if (compatible) {
|
||||
int deAffixedLength = length - i;
|
||||
|
||||
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
|
||||
int stripStart = dictionary.stripOffsets[stripOrd];
|
||||
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
|
||||
int stripLength = stripEnd - stripStart;
|
||||
|
@ -437,13 +432,10 @@ final class Stemmer {
|
|||
if (suffix == previous) {
|
||||
continue;
|
||||
}
|
||||
affixReader.setPosition(8 * suffix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
char stripOrd = (char) (affixReader.readShort() & 0xffff);
|
||||
int condition = (char) (affixReader.readShort() & 0xffff);
|
||||
int condition = dictionary.affixData(suffix, Dictionary.AFFIX_CONDITION);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
char append = (char) (affixReader.readShort() & 0xffff);
|
||||
int append = dictionary.affixData(suffix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
final boolean compatible;
|
||||
if (recursionDepth == 0) {
|
||||
|
@ -473,6 +465,7 @@ final class Stemmer {
|
|||
int appendLength = length - i;
|
||||
int deAffixedLength = length - appendLength;
|
||||
|
||||
int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
|
||||
int stripStart = dictionary.stripOffsets[stripOrd];
|
||||
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
|
||||
int stripLength = stripEnd - stripStart;
|
||||
|
@ -563,11 +556,9 @@ final class Stemmer {
|
|||
boolean caseVariant)
|
||||
throws IOException {
|
||||
// TODO: just pass this in from before, no need to decode it twice
|
||||
affixReader.setPosition(8 * affix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
affixReader.skipBytes(2); // strip
|
||||
boolean crossProduct = ((int) (char) (affixReader.readShort() & 0xffff) & 1) == 1;
|
||||
char append = (char) (affixReader.readShort() & 0xffff);
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
boolean crossProduct = (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
|
||||
char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
|
||||
|
|
Loading…
Reference in New Issue