mirror of https://github.com/apache/lucene.git
LUCENE-9685: Hunspell: prefix condition is only checked on suffix, not stem (#2226)
This commit is contained in:
parent
fdf04d8c63
commit
37e31f2ac7
|
@ -308,32 +308,8 @@ final class Stemmer {
|
|||
if (prefix == previous) {
|
||||
continue;
|
||||
}
|
||||
int condition = dictionary.affixData(prefix, Dictionary.AFFIX_CONDITION);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
int append = dictionary.affixData(prefix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
final boolean compatible;
|
||||
if (recursionDepth == 0) {
|
||||
if (dictionary.onlyincompound == -1) {
|
||||
compatible = true;
|
||||
} else {
|
||||
// check if affix is allowed in a non-compound word
|
||||
compatible = !dictionary.hasFlag(append, (char) dictionary.onlyincompound, scratch);
|
||||
}
|
||||
} else if (crossProduct) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
char[] appendFlags = dictionary.decodeFlags(append, scratch);
|
||||
assert prevFlag >= 0;
|
||||
boolean allowed =
|
||||
dictionary.onlyincompound == -1
|
||||
|| !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
|
||||
compatible = allowed && Dictionary.hasFlag(appendFlags, (char) prevFlag);
|
||||
} else {
|
||||
compatible = false;
|
||||
}
|
||||
|
||||
if (compatible) {
|
||||
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
|
||||
int deAffixedLength = length - i;
|
||||
|
||||
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
|
||||
|
@ -342,13 +318,7 @@ final class Stemmer {
|
|||
int stripLength = stripEnd - stripStart;
|
||||
|
||||
if (!checkCondition(
|
||||
condition,
|
||||
dictionary.stripData,
|
||||
stripStart,
|
||||
stripLength,
|
||||
word,
|
||||
i,
|
||||
deAffixedLength)) {
|
||||
prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -401,35 +371,8 @@ final class Stemmer {
|
|||
if (suffix == previous) {
|
||||
continue;
|
||||
}
|
||||
int condition = dictionary.affixData(suffix, Dictionary.AFFIX_CONDITION);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
int append = dictionary.affixData(suffix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
final boolean compatible;
|
||||
if (recursionDepth == 0) {
|
||||
if (dictionary.onlyincompound == -1) {
|
||||
compatible = true;
|
||||
} else {
|
||||
// check if affix is allowed in a non-compound word
|
||||
compatible = !dictionary.hasFlag(append, (char) dictionary.onlyincompound, scratch);
|
||||
}
|
||||
} else if (crossProduct) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
char[] appendFlags = dictionary.decodeFlags(append, scratch);
|
||||
assert prevFlag >= 0;
|
||||
boolean allowed =
|
||||
dictionary.onlyincompound == -1
|
||||
|| !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
|
||||
compatible =
|
||||
allowed
|
||||
&& ((appendFlags.length == 0 && previousWasPrefix)
|
||||
|| Dictionary.hasFlag(appendFlags, (char) prevFlag));
|
||||
} else {
|
||||
compatible = false;
|
||||
}
|
||||
|
||||
if (compatible) {
|
||||
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
|
||||
int appendLength = length - i;
|
||||
int deAffixedLength = length - appendLength;
|
||||
|
||||
|
@ -439,13 +382,7 @@ final class Stemmer {
|
|||
int stripLength = stripEnd - stripStart;
|
||||
|
||||
if (!checkCondition(
|
||||
condition,
|
||||
word,
|
||||
0,
|
||||
deAffixedLength,
|
||||
dictionary.stripData,
|
||||
stripStart,
|
||||
stripLength)) {
|
||||
suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -474,12 +411,41 @@ final class Stemmer {
|
|||
return stems;
|
||||
}
|
||||
|
||||
private boolean isAffixCompatible(
|
||||
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
|
||||
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
if (recursionDepth == 0) {
|
||||
if (dictionary.onlyincompound == -1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if affix is allowed in a non-compound word
|
||||
return !dictionary.hasFlag(append, (char) dictionary.onlyincompound, scratch);
|
||||
}
|
||||
|
||||
if (isCrossProduct(affix)) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
char[] appendFlags = dictionary.decodeFlags(append, scratch);
|
||||
assert prevFlag >= 0;
|
||||
boolean allowed =
|
||||
dictionary.onlyincompound == -1
|
||||
|| !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
|
||||
if (allowed) {
|
||||
return previousWasPrefix || Dictionary.hasFlag(appendFlags, (char) prevFlag);
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/** checks condition of the concatenation of two strings */
|
||||
// note: this is pretty stupid, we really should subtract strip from the condition up front and
|
||||
// just check the stem
|
||||
// but this is a little bit more complicated.
|
||||
private boolean checkCondition(
|
||||
int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
|
||||
int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
|
||||
int condition = dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) >>> 1;
|
||||
if (condition != 0) {
|
||||
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
|
||||
int state = 0;
|
||||
|
@ -523,9 +489,7 @@ final class Stemmer {
|
|||
boolean circumfix,
|
||||
boolean caseVariant)
|
||||
throws IOException {
|
||||
// TODO: just pass this in from before, no need to decode it twice
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
boolean crossProduct = (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
|
||||
char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
|
@ -578,7 +542,7 @@ final class Stemmer {
|
|||
circumfix = dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
|
||||
}
|
||||
|
||||
if (crossProduct && recursionDepth <= 1) {
|
||||
if (isCrossProduct(affix) && recursionDepth <= 1) {
|
||||
boolean doPrefix;
|
||||
if (recursionDepth == 0) {
|
||||
if (prefix) {
|
||||
|
@ -623,4 +587,8 @@ final class Stemmer {
|
|||
|
||||
return stems;
|
||||
}
|
||||
|
||||
private boolean isCrossProduct(int affix) {
|
||||
return (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,5 +34,9 @@ public class TestDependencies extends StemmerTestBase {
|
|||
assertStemsTo("undrinkables", "drink");
|
||||
assertStemsTo("undrink");
|
||||
assertStemsTo("undrinks");
|
||||
|
||||
assertStemsTo("hydration", "hydrate");
|
||||
assertStemsTo("dehydrate", "hydrate");
|
||||
assertStemsTo("dehydration", "hydrate");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,3 +11,9 @@ SFX Q 0 s . +3SGV
|
|||
|
||||
SFX R Y 1
|
||||
SFX R 0 able/PS . +DER_V_ADJ_ABLE
|
||||
|
||||
PFX h Y 1
|
||||
PFX h 0 de .
|
||||
|
||||
SFX A Y 1
|
||||
SFX A te tion/S .
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
2
|
||||
drink/RQ [verb]
|
||||
drink/S [noun]
|
||||
hydrate/hA
|
Loading…
Reference in New Issue