mirror of https://github.com/apache/lucene.git
LUCENE-9776: Hunspell: allow inflecting the last part of a COMPOUNDRULE compound (#2397)
This commit is contained in:
parent
e7c80f6445
commit
4b3fb1e065
|
@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
|||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -397,8 +398,7 @@ public class Hunspell {
|
|||
if (forms != null) {
|
||||
words.add(forms);
|
||||
|
||||
if (dictionary.compoundRules != null
|
||||
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
|
||||
if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
|
||||
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||
return true;
|
||||
}
|
||||
|
@ -417,13 +417,17 @@ public class Hunspell {
|
|||
|
||||
private boolean checkLastCompoundPart(
|
||||
char[] wordChars, int start, int length, List<IntsRef> words) {
|
||||
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
|
||||
if (forms == null) return false;
|
||||
IntsRef ref = new IntsRef(new int[1], 0, 1);
|
||||
words.add(ref);
|
||||
|
||||
words.add(forms);
|
||||
boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
|
||||
Stemmer.RootProcessor stopOnMatching =
|
||||
(stem, formID, morphDataId) -> {
|
||||
ref.ints[0] = formID;
|
||||
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
|
||||
};
|
||||
boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
|
||||
words.remove(words.size() - 1);
|
||||
return result;
|
||||
return found;
|
||||
}
|
||||
|
||||
private static boolean isNumber(String s) {
|
||||
|
|
|
@ -247,19 +247,12 @@ final class Stemmer {
|
|||
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
|
||||
continue;
|
||||
}
|
||||
// we can't add this form, it only belongs inside a compound word
|
||||
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
|
||||
continue;
|
||||
if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
|
||||
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
|
||||
return false;
|
||||
}
|
||||
if (context.isCompound()) {
|
||||
if (context != WordContext.COMPOUND_END
|
||||
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
|
||||
return false;
|
||||
}
|
||||
if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
|
||||
&& !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
|
||||
continue;
|
||||
}
|
||||
if (!isRootCompatibleWithContext(context, -1, entryId)) {
|
||||
continue;
|
||||
}
|
||||
if (!callProcessor(word, offset, length, processor, forms, i)) {
|
||||
return false;
|
||||
|
@ -540,8 +533,8 @@ final class Stemmer {
|
|||
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
|
||||
return false;
|
||||
}
|
||||
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
|
||||
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
|
||||
if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
|
||||
&& !dictionary.hasFlag(append, dictionary.compoundPermit)) {
|
||||
return false;
|
||||
}
|
||||
if (context == WordContext.COMPOUND_END
|
||||
|
@ -550,18 +543,17 @@ final class Stemmer {
|
|||
&& dictionary.hasFlag(append, dictionary.onlyincompound)) {
|
||||
return false;
|
||||
}
|
||||
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (recursionDepth == 0) {
|
||||
// check if affix is allowed in a non-compound word
|
||||
return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.isCrossProduct(affix)) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
|
||||
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
|
||||
}
|
||||
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -640,18 +632,10 @@ final class Stemmer {
|
|||
}
|
||||
}
|
||||
|
||||
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
|
||||
if (!isRootCompatibleWithContext(context, affix, entryId)) {
|
||||
continue;
|
||||
}
|
||||
if (context.isCompound()) {
|
||||
char cFlag = context.requiredFlag(dictionary);
|
||||
if (!dictionary.hasFlag(entryId, cFlag)
|
||||
&& !isFlagAppendedByAffix(affix, cFlag)
|
||||
&& !dictionary.hasFlag(entryId, dictionary.compoundFlag)
|
||||
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -704,6 +688,20 @@ final class Stemmer {
|
|||
return true;
|
||||
}
|
||||
|
||||
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
|
||||
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
|
||||
return false;
|
||||
}
|
||||
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
|
||||
char cFlag = context.requiredFlag(dictionary);
|
||||
return dictionary.hasFlag(entryId, cFlag)
|
||||
|| isFlagAppendedByAffix(lastAffix, cFlag)
|
||||
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
|
||||
|| isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean callProcessor(
|
||||
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
||||
CharsRef stem = new CharsRef(word, offset, length);
|
||||
|
|
|
@ -17,15 +17,35 @@
|
|||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
enum WordContext {
|
||||
/** non-compound */
|
||||
SIMPLE_WORD,
|
||||
|
||||
/** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
|
||||
COMPOUND_BEGIN,
|
||||
|
||||
/** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
|
||||
COMPOUND_MIDDLE,
|
||||
COMPOUND_END;
|
||||
|
||||
/** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
|
||||
COMPOUND_END,
|
||||
|
||||
/**
|
||||
* The final root in a word with COMPOUNDRULE compounding. The difference from {@link #COMPOUND_END}
|
||||
* is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
|
||||
*/
|
||||
COMPOUND_RULE_END;
|
||||
|
||||
boolean isCompound() {
|
||||
return this != SIMPLE_WORD;
|
||||
}
|
||||
|
||||
boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
|
||||
if (isPrefix) {
|
||||
return this == WordContext.COMPOUND_BEGIN;
|
||||
}
|
||||
return this == WordContext.COMPOUND_END || this == WordContext.COMPOUND_RULE_END;
|
||||
}
|
||||
|
||||
char requiredFlag(Dictionary dictionary) {
|
||||
switch (this) {
|
||||
case COMPOUND_BEGIN:
|
||||
|
|
|
@ -5,3 +5,6 @@ ONLYINCOMPOUND c
|
|||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE n*1t
|
||||
COMPOUNDRULE n*mp
|
||||
|
||||
SFX S Y 1
|
||||
SFX S 0 s
|
|
@ -9,7 +9,7 @@
|
|||
7/nm
|
||||
8/nm
|
||||
9/nm
|
||||
0th/pt
|
||||
0th/ptS
|
||||
1st/p
|
||||
1th/tc
|
||||
2nd/p
|
||||
|
|
|
@ -28,4 +28,5 @@
|
|||
10001st
|
||||
10011th
|
||||
1ST
|
||||
42ND
|
||||
42ND
|
||||
10ths
|
Loading…
Reference in New Issue