LUCENE-9693: Hunspell: check that all flags are > 0 and fit char range (#2238)

This commit is contained in:
Peter Gromov 2021-01-26 09:29:47 +01:00 committed by GitHub
parent 0d88c14837
commit a82634db9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 39 deletions

View File

@ -76,6 +76,8 @@ public class Dictionary {
static final char[] NOFLAGS = new char[0];
static final int FLAG_UNSET = 0;
private static final int DEFAULT_FLAGS = 65510;
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
// TODO: really for suffixes we should reverse the automaton and run them backwards
@ -135,11 +137,11 @@ public class Dictionary {
// if no affixes have continuation classes, no need to do 2-level affix stripping
boolean twoStageAffix;
// NOTE(review): rendered diff hunk without +/- markers — the `int ... = -1` declarations
// and the `char` declarations of the same names are presumably the removed and added
// versions of these fields; confirm against the commit.
int circumfix = -1; // circumfix flag, or -1 if one is not defined
int keepcase = -1; // keepcase flag, or -1 if one is not defined
int needaffix = -1; // needaffix flag, or -1 if one is not defined
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
// The char variants carry no initializer, so they hold 0 (the value of FLAG_UNSET) until
// assigned — presumably meaning "flag not defined by the affix file"; verify at the parser.
char circumfix; // circumfix flag; compared against FLAG_UNSET before use
char keepcase; // keepcase flag
char needaffix; // needaffix flag
char forbiddenword; // forbiddenword flag; compared against FLAG_UNSET before use
char onlyincompound; // onlyincompound flag
int compoundMin = 3;
List<CompoundRule> compoundRules; // nullable
@ -1161,9 +1163,9 @@ public class Dictionary {
}
// Reports whether the first `length` chars of `word` form a dictionary entry that carries the
// forbiddenword flag. Returns false when no forbiddenword flag is set or the word is not found.
// NOTE(review): rendered diff hunk without +/- markers — the condition and the return
// statement each appear twice (sentinel `-1` form, then FLAG_UNSET form), presumably the
// removed line followed by its replacement; confirm against the commit.
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
if (forbiddenword != -1) {
if (forbiddenword != FLAG_UNSET) {
// Only look the word up when a forbiddenword flag exists at all.
IntsRef forms = lookupWord(word, 0, length);
return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
return forms != null && hasFlag(forms, forbiddenword, scratch);
}
return false;
}
@ -1240,7 +1242,12 @@ public class Dictionary {
if (replacement.isEmpty()) {
continue;
}
flags[upto++] = (char) Integer.parseInt(replacement);
int flag = Integer.parseInt(replacement);
if (flag == FLAG_UNSET || flag >= Character.MAX_VALUE) { // read default flags as well
throw new IllegalArgumentException(
"Num flags should be between 0 and " + DEFAULT_FLAGS + ", found " + flag);
}
flags[upto++] = (char) flag;
}
if (upto < flags.length) {
@ -1251,10 +1258,8 @@ public class Dictionary {
// Appends the flag to `to` as its decimal integer value, using "," as the separator.
// NOTE(review): rendered diff hunk without +/- markers — the leading-separator branch
// (comma before the value when the builder is non-empty) and the trailing `to.append(",")`
// are presumably the removed and added variants respectively, not meant to coexist;
// confirm against the commit.
@Override
void appendFlag(char flag, StringBuilder to) {
if (to.length() > 0) {
to.append(",");
}
// Cast to int so the numeric value is written rather than the character itself.
to.append((int) flag);
to.append(",");
}
}
@ -1303,11 +1308,11 @@ public class Dictionary {
}
// Checks whether the flag set stored under `entryId` contains `flag`; the set is decoded
// via decodeFlags(entryId, scratch) and then searched with the array-based hasFlag.
// NOTE(review): rendered diff hunk without +/- markers — the two return statements are
// presumably the removed line and its replacement. The second form short-circuits to false
// for FLAG_UNSET, so the flag set is not decoded in that case.
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
return hasFlag(decodeFlags(entryId, scratch), flag);
return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag);
}
// Checks whether `flags` contains `flag` using a binary search.
// Assumes `flags` is sorted ascending, as Arrays.binarySearch requires — TODO confirm at
// the places that build these arrays.
// NOTE(review): rendered diff hunk without +/- markers — the two return statements are
// presumably the removed line and its replacement. The second form never reports
// FLAG_UNSET as present.
static boolean hasFlag(char[] flags, char flag) {
return Arrays.binarySearch(flags, flag) >= 0;
return flag != FLAG_UNSET && Arrays.binarySearch(flags, flag) >= 0;
}
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {

View File

@ -168,20 +168,18 @@ final class Stemmer {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.needaffix != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
if (Dictionary.hasFlag(wordFlags, dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (dictionary.onlyincompound != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(word, length, forms, i));
}
}
try {
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
stems.addAll(stem(word, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
@ -190,7 +188,7 @@ final class Stemmer {
// Decides whether an entry with the given flags is acceptable for the current casing:
// for a case variant the entry must not carry the dictionary's keepcase flag; otherwise
// it must not carry the hidden flag.
// NOTE(review): rendered diff hunk without +/- markers — the two `?` lines are presumably
// the removed branch (explicit `== -1` check plus cast) and its replacement, which passes
// the flag straight to Dictionary.hasFlag; confirm against the commit.
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
return caseVariant
? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
: !Dictionary.hasHiddenFlag(wordFlags);
}
@ -289,7 +287,7 @@ final class Stemmer {
char[] word,
int length,
int previous,
int prevFlag,
char prevFlag,
int prefixId,
int recursionDepth,
boolean doPrefix,
@ -428,27 +426,19 @@ final class Stemmer {
}
private boolean isAffixCompatible(
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
if (recursionDepth == 0) {
if (dictionary.onlyincompound == -1) {
return true;
}
// check if affix is allowed in a non-compound word
return !dictionary.hasFlag(append, (char) dictionary.onlyincompound, scratch);
return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
}
if (isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
char[] appendFlags = dictionary.decodeFlags(append, scratch);
assert prevFlag >= 0;
boolean allowed =
dictionary.onlyincompound == -1
|| !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
if (allowed) {
return previousWasPrefix || Dictionary.hasFlag(appendFlags, (char) prevFlag);
if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
}
}
@ -528,8 +518,8 @@ final class Stemmer {
// if circumfix was previously set by a prefix, we must check this suffix,
// to ensure it has it, and vice versa
if (dictionary.circumfix != -1) {
boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
if (dictionary.circumfix != Dictionary.FLAG_UNSET) {
boolean suffixCircumfix = isFlagAppendedByAffix(affix, dictionary.circumfix);
if (circumfix != suffixCircumfix) {
continue;
}
@ -540,8 +530,7 @@ final class Stemmer {
continue;
}
// we aren't decompounding (yet)
if (dictionary.onlyincompound != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(strippedWord, length, forms, i));
@ -551,8 +540,8 @@ final class Stemmer {
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we
// have that flag
if (dictionary.circumfix != -1 && !circumfix && prefix) {
circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
if (dictionary.circumfix != Dictionary.FLAG_UNSET && !circumfix && prefix) {
circumfix = isFlagAppendedByAffix(affix, dictionary.circumfix);
}
if (isCrossProduct(affix) && recursionDepth <= 1) {
@ -602,7 +591,7 @@ final class Stemmer {
}
// Reports whether the append-flag set of affix `affixId` contains `flag`.
// Returns false for a negative affix id; the second guard also returns false when
// `flag` is Dictionary.FLAG_UNSET.
// NOTE(review): rendered diff hunk without +/- markers — the two guard lines are presumably
// the removed line and its replacement; confirm against the commit.
private boolean isFlagAppendedByAffix(int affixId, char flag) {
if (affixId < 0) return false;
if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
// Resolve the id of the affix's appended-flags entry, then test membership.
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
return dictionary.hasFlag(appendId, flag, scratch);
}