mirror of https://github.com/apache/lucene.git
LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compound (#2397)
parent e7c80f6445
commit 4b3fb1e065
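For orientation, here is a minimal sketch of how the new behavior could be exercised through the public Hunspell API. This is an illustration only, not part of the commit: the class and constructor signatures are as I understand the org.apache.lucene.analysis.hunspell package, and the file names ordinals.aff / ordinals.dic are hypothetical stand-ins for the ordinal-number test data changed at the bottom of this diff.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.store.ByteBuffersDirectory;

public class CompoundRuleInflectionDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical file names; their contents correspond to the test data shown further down
    // (COMPOUNDRULE n*1t / n*mp plus the new "SFX S 0 s" suffix and the "0th/ptS" entry).
    try (InputStream affix = Files.newInputStream(Path.of("ordinals.aff"));
        InputStream dic = Files.newInputStream(Path.of("ordinals.dic"))) {
      Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "hunspell", affix, dic);
      Hunspell speller = new Hunspell(dictionary);

      // "10ths" is a COMPOUNDRULE compound ("1" + "0th") whose last part is inflected by SFX S.
      // Before this change the suffixed last part was rejected; with it, spell(...) should return true.
      System.out.println(speller.spell("10ths"));
    }
  }
}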
Hunspell.java:

@@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
+import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 
 import java.util.ArrayList;

@@ -397,8 +398,7 @@ public class Hunspell {
       if (forms != null) {
         words.add(forms);
 
-        if (dictionary.compoundRules != null
-            && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
+        if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
           if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
             return true;
           }

@@ -417,13 +417,17 @@
 
   private boolean checkLastCompoundPart(
       char[] wordChars, int start, int length, List<IntsRef> words) {
-    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
-    if (forms == null) return false;
+    IntsRef ref = new IntsRef(new int[1], 0, 1);
+    words.add(ref);
 
-    words.add(forms);
-    boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
+    Stemmer.RootProcessor stopOnMatching =
+        (stem, formID, morphDataId) -> {
+          ref.ints[0] = formID;
+          return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
+        };
+    boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
     words.remove(words.size() - 1);
-    return result;
+    return found;
   }
 
   private static boolean isNumber(String s) {
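A note on the control flow in checkLastCompoundPart above, with a tiny standalone illustration (plain Java, not Lucene code): doStem is assumed to stop as soon as the supplied RootProcessor returns false and to report whether it ran to completion. Returning noneMatch(...) from the processor therefore aborts stemming exactly when some COMPOUNDRULE fully matches, and negating doStem's result turns that abort into found == true.

import java.util.List;
import java.util.function.Predicate;

// Standalone sketch of the "abort the walk, then invert the result" idiom used above.
// walk(...) plays the role of doStem: it stops early when the callback returns false and
// reports whether it completed without being stopped.
class StopOnMatchSketch {
  static boolean walk(List<String> candidateRoots, Predicate<String> keepGoing) {
    for (String root : candidateRoots) {
      if (!keepGoing.test(root)) {
        return false; // aborted by the callback
      }
    }
    return true; // ran to completion, nothing interesting found
  }

  public static void main(String[] args) {
    List<String> roots = List.of("0th", "0ths");
    // The callback says "keep going" only while no candidate satisfies the match condition,
    // so negating walk(...) yields true exactly when a match was found.
    boolean found = !walk(roots, root -> !root.endsWith("s"));
    System.out.println(found); // true
  }
}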
Stemmer.java:

@@ -247,19 +247,12 @@ final class Stemmer {
         if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
           continue;
         }
-        // we can't add this form, it only belongs inside a compound word
-        if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
-          continue;
+        if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
+            && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
+          return false;
         }
-        if (context.isCompound()) {
-          if (context != WordContext.COMPOUND_END
-              && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
-            return false;
-          }
-          if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
-              && !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
-            continue;
-          }
+        if (!isRootCompatibleWithContext(context, -1, entryId)) {
+          continue;
         }
         if (!callProcessor(word, offset, length, processor, forms, i)) {
           return false;

@@ -540,8 +533,8 @@ final class Stemmer {
       if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
         return false;
       }
-      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
-      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
+      if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
+          && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
         return false;
       }
       if (context == WordContext.COMPOUND_END

@@ -550,18 +543,17 @@ final class Stemmer {
           && dictionary.hasFlag(append, dictionary.onlyincompound)) {
         return false;
       }
+    } else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
+      return false;
     }
 
     if (recursionDepth == 0) {
-      // check if affix is allowed in a non-compound word
-      return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
+      return true;
     }
 
     if (dictionary.isCrossProduct(affix)) {
       // cross check incoming continuation class (flag of previous affix) against list.
-      if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
-        return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
-      }
+      return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
     }
 
     return false;

@@ -640,18 +632,10 @@ final class Stemmer {
           }
         }
 
-        if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+        if (!isRootCompatibleWithContext(context, affix, entryId)) {
           continue;
         }
-        if (context.isCompound()) {
-          char cFlag = context.requiredFlag(dictionary);
-          if (!dictionary.hasFlag(entryId, cFlag)
-              && !isFlagAppendedByAffix(affix, cFlag)
-              && !dictionary.hasFlag(entryId, dictionary.compoundFlag)
-              && !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
-            continue;
-          }
-        }
         if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
           return false;
         }

@@ -704,6 +688,20 @@
     return true;
   }
 
+  private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
+    if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+      return false;
+    }
+    if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
+      char cFlag = context.requiredFlag(dictionary);
+      return dictionary.hasFlag(entryId, cFlag)
+          || isFlagAppendedByAffix(lastAffix, cFlag)
+          || dictionary.hasFlag(entryId, dictionary.compoundFlag)
+          || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
+    }
+    return true;
+  }
+
   private boolean callProcessor(
       char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
     CharsRef stem = new CharsRef(word, offset, length);
WordContext.java:

@@ -17,15 +17,35 @@
 package org.apache.lucene.analysis.hunspell;
 
 enum WordContext {
+  /** non-compound */
   SIMPLE_WORD,
+
+  /** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
   COMPOUND_BEGIN,
+
+  /** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
   COMPOUND_MIDDLE,
-  COMPOUND_END;
+
+  /** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
+  COMPOUND_END,
+
+  /**
+   * The final root in a word with COMPOUNDRULE compounding. The difference to {@link #COMPOUND_END}
+   * is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
+   */
+  COMPOUND_RULE_END;
 
   boolean isCompound() {
     return this != SIMPLE_WORD;
   }
 
+  boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
+    if (isPrefix) {
+      return this == WordContext.COMPOUND_BEGIN;
+    }
+    return this == WordContext.COMPOUND_END || this == WordContext.COMPOUND_RULE_END;
+  }
+
   char requiredFlag(Dictionary dictionary) {
     switch (this) {
       case COMPOUND_BEGIN:
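The javadoc above draws the key distinction: for a COMPOUNDFLAG/BEGIN/MIDDLE/END compound the final root must itself carry a compound flag, while for a COMPOUNDRULE compound the rule pattern decides membership, so no compound flag is required and even ONLYINCOMPOUND roots are acceptable as the last part. A small self-contained sketch of that contract (illustration only; the flag letters are hypothetical and this is not code from the commit):

import java.util.Set;

// Illustration of the contract described in the WordContext javadoc; the flag letters
// stand in for whatever the .aff file declares.
class LastRootSketch {
  static final char COMPOUNDFLAG = 'A';
  static final char COMPOUNDEND = 'E';
  static final char ONLYINCOMPOUND = 'c';

  enum End { COMPOUND_END, COMPOUND_RULE_END }

  static boolean lastRootAllowed(End context, Set<Character> rootFlags) {
    if (context == End.COMPOUND_END) {
      // Flag-based compounding: the final root must itself be marked as compoundable.
      return rootFlags.contains(COMPOUNDFLAG) || rootFlags.contains(COMPOUNDEND);
    }
    // COMPOUNDRULE compounding: the rule pattern already vouches for this root, so no
    // compound flag is needed and an ONLYINCOMPOUND root (like "1th/tc" below) is fine.
    return true;
  }

  public static void main(String[] args) {
    System.out.println(lastRootAllowed(End.COMPOUND_END, Set.of(ONLYINCOMPOUND)));      // false
    System.out.println(lastRootAllowed(End.COMPOUND_RULE_END, Set.of(ONLYINCOMPOUND))); // true
  }
}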
Test data (.aff):

@@ -5,3 +5,6 @@ ONLYINCOMPOUND c
 COMPOUNDRULE 2
 COMPOUNDRULE n*1t
 COMPOUNDRULE n*mp
+
+SFX S Y 1
+SFX S 0 s
Test data (.dic):

@@ -9,7 +9,7 @@
 7/nm
 8/nm
 9/nm
-0th/pt
+0th/ptS
 1st/p
 1th/tc
 2nd/p
Test data (words expected to be accepted):

@@ -28,4 +28,5 @@
 10001st
 10011th
 1ST
 42ND
+10ths