LUCENE-9677: simplify Dictionary.affixData storage (#2218)

Use char[] instead of byte[], get rid of unnecessary byte array readers/writers.
This commit is contained in:
Peter Gromov 2021-01-19 09:22:33 +01:00 committed by GitHub
parent ab08fdc6f0
commit 426c902bc9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 29 deletions

View File

@ -45,7 +45,6 @@ import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
@ -118,10 +117,16 @@ public class Dictionary {
char[] stripData;
int[] stripOffsets;
// 8 bytes per affix
byte[] affixData = new byte[64];
// 4 chars per affix, each char representing an unsigned 2-byte integer
char[] affixData = new char[32];
private int currentAffix = 0;
// offsets in affixData
static final int AFFIX_FLAG = 0;
static final int AFFIX_STRIP_ORD = 1;
static final int AFFIX_CONDITION = 2;
static final int AFFIX_APPEND = 3;
// Default flag parsing strategy
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
@ -480,12 +485,9 @@ public class Dictionary {
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX_PATTERN);
int numLines = Integer.parseInt(args[3]);
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
ByteArrayDataOutput affixWriter =
new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4);
for (int i = 0; i < numLines; i++) {
assert affixWriter.getPosition() == currentAffix << 3;
String line = reader.readLine();
String[] ruleArgs = line.split("\\s+");
@ -581,12 +583,13 @@ public class Dictionary {
"Too many unique append flags, please report this to dev@lucene.apache.org");
}
affixWriter.writeShort((short) flag);
affixWriter.writeShort((short) stripOrd.intValue());
int dataStart = currentAffix * 4;
affixData[dataStart + AFFIX_FLAG] = flag;
affixData[dataStart + AFFIX_STRIP_ORD] = (char) stripOrd.intValue();
// encode crossProduct into patternIndex
int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short) patternOrd);
affixWriter.writeShort((short) appendFlagsOrd);
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
if (needsInputCleaning) {
CharSequence cleaned = cleanInput(affixArg, sb);
@ -602,6 +605,10 @@ public class Dictionary {
}
}
/**
 * Reads one field of an affix record from the flat {@code affixData} array.
 *
 * <p>Each affix takes up 4 consecutive chars in that array; {@code offset} picks the field within
 * the record and is expected to be one of {@code AFFIX_FLAG}, {@code AFFIX_STRIP_ORD}, {@code
 * AFFIX_CONDITION} or {@code AFFIX_APPEND}.
 *
 * @param affixIndex index of the affix record
 * @param offset position of the requested field inside the record
 * @return the char stored for that field
 */
char affixData(int affixIndex, int offset) {
  // records are laid out back to back, 4 chars each
  final int recordStart = affixIndex * 4;
  return affixData[recordStart + offset];
}
private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
throws IOException, ParseException {
Map<String, String> mappings = new TreeMap<>();

View File

@ -21,7 +21,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
@ -39,7 +38,6 @@ final class Stemmer {
private final Dictionary dictionary;
private final BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
private final ByteArrayDataInput affixReader;
// used for normalization
private final StringBuilder scratchSegment = new StringBuilder();
@ -56,7 +54,6 @@ final class Stemmer {
*/
public Stemmer(Dictionary dictionary) {
this.dictionary = dictionary;
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
for (int level = 0; level < 3; level++) {
if (dictionary.prefixes != null) {
prefixArcs[level] = new FST.Arc<>();
@ -339,13 +336,10 @@ final class Stemmer {
if (prefix == previous) {
continue;
}
affixReader.setPosition(8 * prefix);
char flag = (char) (affixReader.readShort() & 0xffff);
char stripOrd = (char) (affixReader.readShort() & 0xffff);
int condition = (char) (affixReader.readShort() & 0xffff);
int condition = dictionary.affixData(prefix, Dictionary.AFFIX_CONDITION);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
int append = dictionary.affixData(prefix, Dictionary.AFFIX_APPEND);
final boolean compatible;
if (recursionDepth == 0) {
@ -373,6 +367,7 @@ final class Stemmer {
if (compatible) {
int deAffixedLength = length - i;
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
@ -437,13 +432,10 @@ final class Stemmer {
if (suffix == previous) {
continue;
}
affixReader.setPosition(8 * suffix);
char flag = (char) (affixReader.readShort() & 0xffff);
char stripOrd = (char) (affixReader.readShort() & 0xffff);
int condition = (char) (affixReader.readShort() & 0xffff);
int condition = dictionary.affixData(suffix, Dictionary.AFFIX_CONDITION);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
int append = dictionary.affixData(suffix, Dictionary.AFFIX_APPEND);
final boolean compatible;
if (recursionDepth == 0) {
@ -473,6 +465,7 @@ final class Stemmer {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
@ -563,11 +556,9 @@ final class Stemmer {
boolean caseVariant)
throws IOException {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
affixReader.skipBytes(2); // strip
boolean crossProduct = ((int) (char) (affixReader.readShort() & 0xffff) & 1) == 1;
char append = (char) (affixReader.readShort() & 0xffff);
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean crossProduct = (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
List<CharsRef> stems = new ArrayList<>();