LUCENE-9664: Hunspell support: fix most IntelliJ warnings, cleanup (#2202)

This commit is contained in:
Peter Gromov 2021-01-15 13:52:34 +01:00 committed by GitHub
parent 90131a605a
commit 82f6f161ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 125 additions and 150 deletions

View File

@ -140,8 +140,6 @@ public class Dictionary {
// when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
boolean hasStemExceptions;
private final Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
boolean ignoreCase;
boolean complexPrefixes;
// if no affixes have continuation classes, no need to do 2-level affix stripping
@ -210,6 +208,7 @@ public class Dictionary {
this.needsOutputCleaning = false; // set if we have an OCONV
flagLookup.add(new BytesRef()); // no flags -> ord 0
Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
Path aff = Files.createTempFile(tempPath, "affix", "aff");
OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
InputStream aff1 = null;
@ -252,33 +251,33 @@ public class Dictionary {
}
/** Looks up Hunspell word forms from the dictionary */
IntsRef lookupWord(char word[], int offset, int length) {
IntsRef lookupWord(char[] word, int offset, int length) {
return lookup(words, word, offset, length);
}
// only for testing
IntsRef lookupPrefix(char word[], int offset, int length) {
return lookup(prefixes, word, offset, length);
IntsRef lookupPrefix(char[] word) {
return lookup(prefixes, word, 0, word.length);
}
// only for testing
IntsRef lookupSuffix(char word[], int offset, int length) {
return lookup(suffixes, word, offset, length);
IntsRef lookupSuffix(char[] word) {
return lookup(suffixes, word, 0, word.length);
}
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
if (fst == null) {
return null;
}
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
// Accumulate output as we go
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int l = offset + length;
try {
for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
cp = Character.codePointAt(word, i, l);
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
return null;
@ -320,7 +319,7 @@ public class Dictionary {
seenStrips.put("", 0);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
String line;
while ((line = reader.readLine()) != null) {
// ignore any BOM marker on first line
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
@ -344,31 +343,31 @@ public class Dictionary {
complexPrefixes =
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
} else if (line.startsWith(CIRCUMFIX_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
}
circumfix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(KEEPCASE_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
}
keepcase = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
}
needaffix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
}
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(IGNORE_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
}
@ -376,7 +375,7 @@ public class Dictionary {
Arrays.sort(ignore);
needsInputCleaning = true;
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
String type = parts[0];
if (parts.length != 2) {
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
@ -475,10 +474,10 @@ public class Dictionary {
BytesRefBuilder scratch = new BytesRefBuilder();
StringBuilder sb = new StringBuilder();
String args[] = header.split("\\s+");
String[] args = header.split("\\s+");
boolean crossProduct = args[2].equals("Y");
boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX_PATTERN);
int numLines = Integer.parseInt(args[3]);
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
@ -488,7 +487,7 @@ public class Dictionary {
for (int i = 0; i < numLines; i++) {
assert affixWriter.getPosition() == currentAffix << 3;
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
String[] ruleArgs = line.split("\\s+");
// from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
// condition is optional
@ -501,7 +500,7 @@ public class Dictionary {
char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
String affixArg = ruleArgs[3];
char appendFlags[] = null;
char[] appendFlags = null;
// first: parse continuation classes out of affix
int flagSep = affixArg.lastIndexOf('/');
@ -585,7 +584,7 @@ public class Dictionary {
affixWriter.writeShort((short) flag);
affixWriter.writeShort((short) stripOrd.intValue());
// encode crossProduct into patternIndex
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short) patternOrd);
affixWriter.writeShort((short) appendFlagsOrd);
@ -598,12 +597,7 @@ public class Dictionary {
affixArg = new StringBuilder(affixArg).reverse().toString();
}
List<Integer> list = affixes.get(affixArg);
if (list == null) {
list = new ArrayList<>();
affixes.put(affixArg, list);
}
list.add(currentAffix);
affixes.computeIfAbsent(affixArg, __ -> new ArrayList<>()).add(currentAffix);
currentAffix++;
}
}
@ -614,7 +608,7 @@ public class Dictionary {
for (int i = 0; i < num; i++) {
String line = reader.readLine();
String parts[] = line.split("\\s+");
String[] parts = line.split("\\s+");
if (parts.length != 3) {
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
}
@ -707,7 +701,7 @@ public class Dictionary {
* definition
*/
static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
String parts[] = flagLine.split("\\s+");
String[] parts = flagLine.split("\\s+");
if (parts.length != 2) {
throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
}
@ -724,11 +718,11 @@ public class Dictionary {
throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
final char MORPH_SEPARATOR =
private static final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
private static final char MORPH_SEPARATOR =
0x1e; // separator for boundary of entry (may be followed by morph data)
String unescapeEntry(String entry) {
private String unescapeEntry(String entry) {
StringBuilder sb = new StringBuilder();
int end = morphBoundary(entry);
for (int i = 0; i < end; i++) {
@ -738,9 +732,7 @@ public class Dictionary {
i++;
} else if (ch == '/') {
sb.append(FLAG_SEPARATOR);
} else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) {
// BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
} else {
} else if (!shouldSkipEscapedChar(ch)) {
sb.append(ch);
}
}
@ -748,9 +740,7 @@ public class Dictionary {
if (end < entry.length()) {
for (int i = end; i < entry.length(); i++) {
char c = entry.charAt(i);
if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) {
// BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
} else {
if (!shouldSkipEscapedChar(c)) {
sb.append(c);
}
}
@ -758,6 +748,11 @@ public class Dictionary {
return sb.toString();
}
/**
 * Tells whether a character read from a dictionary entry has to be dropped while the entry is
 * being unescaped, because it collides with one of the internal separator characters.
 *
 * @param ch character taken from the raw entry
 * @return {@code true} if {@code ch} equals {@code FLAG_SEPARATOR} or {@code MORPH_SEPARATOR}
 */
private static boolean shouldSkipEscapedChar(char ch) {
  // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
  if (ch == MORPH_SEPARATOR) {
    return true;
  }
  return ch == FLAG_SEPARATOR;
}
static int morphBoundary(String line) {
int end = indexOfSpaceOrTab(line, 0);
if (end == -1) {
@ -812,9 +807,9 @@ public class Dictionary {
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
String line =
lines.readLine(); // first line is number of entries (approximately, sometimes)
String line;
while ((line = lines.readLine()) != null) {
// wild and unpredictable code comment rules
if (line.isEmpty()
@ -825,7 +820,7 @@ public class Dictionary {
}
line = unescapeEntry(line);
// if we haven't seen any stem exceptions, try to parse one
if (hasStemExceptions == false) {
if (!hasStemExceptions) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
@ -861,35 +856,28 @@ public class Dictionary {
new OfflineSorter(
tempDir,
tempFileNamePrefix,
new Comparator<BytesRef>() {
BytesRef scratch1 = new BytesRef();
BytesRef scratch2 = new BytesRef();
new Comparator<>() {
final BytesRef scratch1 = new BytesRef();
final BytesRef scratch2 = new BytesRef();
/**
 * Points {@code scratch} at the bytes of {@code o}, cut off just before the last flag or
 * morphology separator found in it, so that only that leading part takes part in the
 * comparison. When no separator is present the full length of {@code o} is kept.
 */
private void initScratch(BytesRef o, BytesRef scratch) {
  final byte[] data = o.bytes;
  final int start = o.offset;
  int visibleLength = o.length;
  // scan backwards: the cut happens at the separator closest to the end
  int pos = visibleLength;
  while (--pos >= 0) {
    final byte b = data[start + pos];
    if (b == FLAG_SEPARATOR || b == MORPH_SEPARATOR) {
      visibleLength = pos;
      break;
    }
  }
  scratch.bytes = data;
  scratch.offset = start;
  scratch.length = visibleLength;
}
@Override
public int compare(BytesRef o1, BytesRef o2) {
scratch1.bytes = o1.bytes;
scratch1.offset = o1.offset;
scratch1.length = o1.length;
for (int i = scratch1.length - 1; i >= 0; i--) {
if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
|| scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
scratch1.length = i;
break;
}
}
scratch2.bytes = o2.bytes;
scratch2.offset = o2.offset;
scratch2.length = o2.length;
for (int i = scratch2.length - 1; i >= 0; i--) {
if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
|| scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
scratch2.length = i;
break;
}
}
initScratch(o1, scratch1);
initScratch(o2, scratch2);
int cmp = scratch1.compareTo(scratch2);
if (cmp == 0) {
@ -933,7 +921,7 @@ public class Dictionary {
String line = scratch.utf8ToString();
String entry;
char wordForm[];
char[] wordForm;
int end;
int flagSep = line.indexOf(FLAG_SEPARATOR);
@ -980,7 +968,7 @@ public class Dictionary {
words.add(scratchInts.get(), currentOrds.get());
}
// swap current
if (cmp > 0 || currentEntry == null) {
if (cmp > 0) {
currentEntry = entry;
currentOrds = new IntsRefBuilder(); // must be this way
}
@ -994,6 +982,7 @@ public class Dictionary {
}
// finalize last entry
assert currentEntry != null;
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
@ -1011,7 +1000,7 @@ public class Dictionary {
return CharsRef.EMPTY_CHARS;
}
int len = b.length >>> 1;
char flags[] = new char[len];
char[] flags = new char[len];
int upto = 0;
int end = b.offset + b.length;
for (int i = b.offset; i < end; i += 2) {
@ -1020,19 +1009,18 @@ public class Dictionary {
return flags;
}
static void encodeFlags(BytesRefBuilder b, char flags[]) {
private static void encodeFlags(BytesRefBuilder b, char[] flags) {
int len = flags.length << 1;
b.grow(len);
b.clear();
for (int i = 0; i < flags.length; i++) {
int flag = flags[i];
for (int flag : flags) {
b.append((byte) ((flag >> 8) & 0xff));
b.append((byte) (flag & 0xff));
}
}
private void parseAlias(String line) {
String ruleArgs[] = line.split("\\s+");
String[] ruleArgs = line.split("\\s+");
if (aliases == null) {
// first line should be the aliases count
final int count = Integer.parseInt(ruleArgs[1]);
@ -1102,7 +1090,7 @@ public class Dictionary {
* @return Parsed flag
*/
char parseFlag(String rawFlag) {
char flags[] = parseFlags(rawFlag);
char[] flags = parseFlags(rawFlag);
if (flags.length != 1) {
throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
}
@ -1140,9 +1128,9 @@ public class Dictionary {
char[] flags = new char[rawFlagParts.length];
int upto = 0;
for (int i = 0; i < rawFlagParts.length; i++) {
for (String rawFlagPart : rawFlagParts) {
// note, removing the trailing X/leading I for nepali... what is the rule here?!
String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
String replacement = rawFlagPart.replaceAll("[^0-9]", "");
// note, ignoring empty flags (this happens in danish, for example)
if (replacement.isEmpty()) {
continue;
@ -1185,13 +1173,13 @@ public class Dictionary {
builder.append(combined);
}
char flags[] = new char[builder.length()];
char[] flags = new char[builder.length()];
builder.getChars(0, builder.length(), flags, 0);
return flags;
}
}
static boolean hasFlag(char flags[], char flag) {
static boolean hasFlag(char[] flags, char flag) {
return Arrays.binarySearch(flags, flag) >= 0;
}
@ -1247,7 +1235,7 @@ public class Dictionary {
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
// temporary stuff
@ -1290,6 +1278,7 @@ public class Dictionary {
private static Path DEFAULT_TEMP_DIR;
/** Used by test framework */
@SuppressWarnings("unused")
public static void setDefaultTempDir(Path tempDir) {
DEFAULT_TEMP_DIR = tempDir;
}
@ -1306,7 +1295,7 @@ public class Dictionary {
throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");
}
Path tempDirectory = Paths.get(tempDirPath);
if (Files.isWritable(tempDirectory) == false) {
if (!Files.isWritable(tempDirectory)) {
throw new IOException(
"Java's temporary folder not present or writeable?: " + tempDirectory.toAbsolutePath());
}

View File

@ -43,7 +43,7 @@ final class Stemmer {
// used for normalization
private final StringBuilder scratchSegment = new StringBuilder();
private char scratchBuffer[] = new char[32];
private char[] scratchBuffer = new char[32];
// it's '1' if we have no stem exceptions, otherwise every other form
// is really an ID pointing to the exception table
@ -86,7 +86,7 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<CharsRef> stem(char word[], int length) {
public List<CharsRef> stem(char[] word, int length) {
if (dictionary.needsInputCleaning) {
scratchSegment.setLength(0);
@ -128,7 +128,7 @@ final class Stemmer {
private static final int UPPER_CASE = 2;
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
private int caseOf(char word[], int length) {
private int caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
}
@ -152,7 +152,7 @@ final class Stemmer {
}
/** folds titlecase variant of word to titleBuffer */
private void caseFoldTitle(char word[], int length) {
private void caseFoldTitle(char[] word, int length) {
titleBuffer = ArrayUtil.grow(titleBuffer, length);
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
@ -161,13 +161,13 @@ final class Stemmer {
}
/** folds lowercase variant of word (title cased) to lowerBuffer */
private void caseFoldLower(char word[], int length) {
private void caseFoldLower(char[] word, int length) {
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
}
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
@ -177,7 +177,7 @@ final class Stemmer {
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char wordFlags[] = Dictionary.decodeFlags(scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
// we are looking for a case variant, but this word does not allow it
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
continue;
@ -196,7 +196,6 @@ final class Stemmer {
}
}
try {
boolean v =
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
@ -210,7 +209,7 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<CharsRef> uniqueStems(char word[], int length) {
public List<CharsRef> uniqueStems(char[] word, int length) {
List<CharsRef> stems = stem(word, length);
if (stems.size() < 2) {
return stems;
@ -226,7 +225,7 @@ final class Stemmer {
return deduped;
}
private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) {
private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
final String exception;
if (dictionary.hasStemExceptions) {
int exceptionID = forms.ints[forms.offset + formID + 1];
@ -251,7 +250,7 @@ final class Stemmer {
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
char cleaned[] = new char[scratchSegment.length()];
char[] cleaned = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length);
} else {
@ -264,15 +263,15 @@ final class Stemmer {
}
// some state for traversing FSTs
final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
private final FST.BytesReader[] prefixReaders = new FST.BytesReader[3];
@SuppressWarnings({"unchecked", "rawtypes"})
final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];
private final FST.Arc<IntsRef>[] prefixArcs = new FST.Arc[3];
final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
private final FST.BytesReader[] suffixReaders = new FST.BytesReader[3];
@SuppressWarnings({"unchecked", "rawtypes"})
final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];
private final FST.Arc<IntsRef>[] suffixArcs = new FST.Arc[3];
/**
* Generates a list of stems for the provided word
@ -296,7 +295,7 @@ final class Stemmer {
* @return List of stems, or empty list if no stems are found
*/
private List<CharsRef> stem(
char word[],
char[] word,
int length,
int previous,
int prevFlag,
@ -330,12 +329,10 @@ final class Stemmer {
output = fst.outputs.add(output, arc.output());
}
}
IntsRef prefixes = null;
if (!arc.isFinal()) {
continue;
} else {
prefixes = fst.outputs.add(output, arc.nextFinalOutput());
}
IntsRef prefixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
@ -357,13 +354,13 @@ final class Stemmer {
} else {
// check if affix is allowed in a non-compound word
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
boolean allowed =
dictionary.onlyincompound == -1
@ -374,8 +371,7 @@ final class Stemmer {
}
if (compatible) {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
int deAffixedLength = length - i;
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
@ -387,14 +383,14 @@ final class Stemmer {
stripStart,
stripLength,
word,
deAffixedStart,
i,
deAffixedLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
char[] strippedWord = new char[stripLength + deAffixedLength];
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList =
applyAffix(
@ -431,12 +427,10 @@ final class Stemmer {
output = fst.outputs.add(output, arc.output());
}
}
IntsRef suffixes = null;
if (!arc.isFinal()) {
continue;
} else {
suffixes = fst.outputs.add(output, arc.nextFinalOutput());
}
IntsRef suffixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
@ -458,13 +452,13 @@ final class Stemmer {
} else {
// check if affix is allowed in a non-compound word
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
boolean allowed =
dictionary.onlyincompound == -1
@ -494,7 +488,7 @@ final class Stemmer {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
char[] strippedWord = new char[stripLength + deAffixedLength];
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(
dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
@ -524,7 +518,7 @@ final class Stemmer {
// just check the stem
// but this is a little bit more complicated.
private boolean checkCondition(
int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
if (condition != 0) {
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
int state = 0;
@ -559,7 +553,7 @@ final class Stemmer {
* @return List of stems for the word, or an empty list if none are found
*/
List<CharsRef> applyAffix(
char strippedWord[],
char[] strippedWord,
int length,
int affix,
int prefixFlag,
@ -572,9 +566,7 @@ final class Stemmer {
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
affixReader.skipBytes(2); // strip
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
boolean crossProduct = ((int) (char) (affixReader.readShort() & 0xffff) & 1) == 1;
char append = (char) (affixReader.readShort() & 0xffff);
List<CharsRef> stems = new ArrayList<>();
@ -583,18 +575,18 @@ final class Stemmer {
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char wordFlags[] = Dictionary.decodeFlags(scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
if (Dictionary.hasFlag(wordFlags, flag)) {
// confusing: in this one exception, we already chained the first prefix against the
// second,
// so it doesn't need to be checked against the word
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (chainedPrefix == false
if (!chainedPrefix
&& prefixFlag >= 0
&& !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
// see if we can chain prefix thru the suffix continuation class (only if it has any!)
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
if (!hasCrossCheckedFlag((char) prefixFlag, appendFlags, false)) {
continue;
}
@ -604,7 +596,7 @@ final class Stemmer {
// to ensure it has it, and vice versa
if (dictionary.circumfix != -1) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
if (circumfix != suffixCircumfix) {
continue;
@ -631,7 +623,7 @@ final class Stemmer {
// have that flag
if (dictionary.circumfix != -1 && !circumfix && prefix) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
circumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
}
@ -654,7 +646,7 @@ final class Stemmer {
true,
circumfix,
caseVariant));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
} else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
@ -688,9 +680,7 @@ final class Stemmer {
true,
circumfix,
caseVariant));
} else if (prefix == false
&& dictionary.complexPrefixes == false
&& dictionary.twoStageAffix) {
} else if (!prefix && !dictionary.complexPrefixes && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(
stem(

View File

@ -42,18 +42,19 @@ public class TestDictionary extends LuceneTestCase {
Directory tempDir = getDirectory();
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
assertNotNull(ordList);
assertEquals(1, ordList.length);
BytesRef ref = new BytesRef();
dictionary.flagLookup.get(ordList.ints[0], ref);
char flags[] = Dictionary.decodeFlags(ref);
char[] flags = Dictionary.decodeFlags(ref);
assertEquals(1, flags.length);
ordList = dictionary.lookupWord(new char[] {'l', 'u', 'c', 'e', 'n'}, 0, 5);
int offset = random().nextInt(10);
ordList = dictionary.lookupWord((" ".repeat(offset) + "lucen").toCharArray(), offset, 5);
assertNotNull(ordList);
assertEquals(1, ordList.length);
dictionary.flagLookup.get(ordList.ints[0], ref);
@ -71,12 +72,12 @@ public class TestDictionary extends LuceneTestCase {
Directory tempDir = getDirectory();
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
BytesRef ref = new BytesRef();
dictionary.flagLookup.get(ordList.ints[0], ref);
char flags[] = Dictionary.decodeFlags(ref);
char[] flags = Dictionary.decodeFlags(ref);
assertEquals(1, flags.length);
affixStream.close();
@ -90,12 +91,12 @@ public class TestDictionary extends LuceneTestCase {
Directory tempDir = getDirectory();
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
BytesRef ref = new BytesRef();
dictionary.flagLookup.get(ordList.ints[0], ref);
char flags[] = Dictionary.decodeFlags(ref);
char[] flags = Dictionary.decodeFlags(ref);
assertEquals(1, flags.length);
affixStream.close();
@ -109,12 +110,12 @@ public class TestDictionary extends LuceneTestCase {
Directory tempDir = getDirectory();
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
BytesRef ref = new BytesRef();
dictionary.flagLookup.get(ordList.ints[0], ref);
char flags[] = Dictionary.decodeFlags(ref);
char[] flags = Dictionary.decodeFlags(ref);
assertEquals(1, flags.length);
affixStream.close();
@ -131,9 +132,7 @@ public class TestDictionary extends LuceneTestCase {
ParseException expected =
expectThrows(
ParseException.class,
() -> {
new Dictionary(tempDir, "dictionary", affixStream, dictStream);
});
() -> new Dictionary(tempDir, "dictionary", affixStream, dictStream));
assertTrue(
expected
.getMessage()
@ -153,10 +152,7 @@ public class TestDictionary extends LuceneTestCase {
Exception expected =
expectThrows(
Exception.class,
() -> {
new Dictionary(tempDir, "dictionary", affixStream, dictStream);
});
Exception.class, () -> new Dictionary(tempDir, "dictionary", affixStream, dictStream));
assertTrue(expected.getMessage().startsWith("expected only one flag"));
affixStream.close();
@ -272,7 +268,7 @@ public class TestDictionary extends LuceneTestCase {
Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
}
public void testFlagWithCrazyWhitespace() throws Exception {
public void testFlagWithCrazyWhitespace() {
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8"));
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
}