LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compound (#2397)

This commit is contained in:
Peter Gromov 2021-02-19 20:03:34 +01:00 committed by GitHub
parent e7c80f6445
commit 4b3fb1e065
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 65 additions and 39 deletions

View File

@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
@ -397,8 +398,7 @@ public class Hunspell {
if (forms != null) {
words.add(forms);
if (dictionary.compoundRules != null
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
return true;
}
@ -417,13 +417,17 @@ public class Hunspell {
private boolean checkLastCompoundPart(
char[] wordChars, int start, int length, List<IntsRef> words) {
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
if (forms == null) return false;
IntsRef ref = new IntsRef(new int[1], 0, 1);
words.add(ref);
words.add(forms);
boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
Stemmer.RootProcessor stopOnMatching =
(stem, formID, morphDataId) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};
boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
words.remove(words.size() - 1);
return result;
return found;
}
private static boolean isNumber(String s) {

View File

@ -247,19 +247,12 @@ final class Stemmer {
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
continue;
if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (context.isCompound()) {
if (context != WordContext.COMPOUND_END
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
continue;
}
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
if (!callProcessor(word, offset, length, processor, forms, i)) {
return false;
@ -540,8 +533,8 @@ final class Stemmer {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
}
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
&& !dictionary.hasFlag(append, dictionary.compoundPermit)) {
return false;
}
if (context == WordContext.COMPOUND_END
@ -550,18 +543,17 @@ final class Stemmer {
&& dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
if (recursionDepth == 0) {
// check if affix is allowed in a non-compound word
return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
return true;
}
if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
return false;
@ -640,18 +632,10 @@ final class Stemmer {
}
}
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue;
}
if (context.isCompound()) {
char cFlag = context.requiredFlag(dictionary);
if (!dictionary.hasFlag(entryId, cFlag)
&& !isFlagAppendedByAffix(affix, cFlag)
&& !dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
continue;
}
}
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false;
}
@ -704,6 +688,20 @@ final class Stemmer {
return true;
}
/**
 * Checks whether the dictionary entry {@code entryId} may serve as a root in the given word
 * context.
 *
 * <ul>
 *   <li>Outside of compounds, an entry is rejected only if it carries the ONLYINCOMPOUND flag.
 *   <li>In {@link WordContext#COMPOUND_RULE_END}, no flag check is performed here.
 *   <li>In any other compound context, the context-specific flag or the generic compound flag
 *       must be present either on the entry itself or be appended by {@code lastAffix}.
 * </ul>
 *
 * @param context the position of the root within the (possibly compound) word
 * @param lastAffix the affix passed on to {@code isFlagAppendedByAffix}; callers in this class
 *     pass -1 when no affix has been applied
 * @param entryId the dictionary entry being considered as a root
 * @return {@code true} if the root is acceptable in {@code context}
 */
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
  if (!context.isCompound()) {
    // a simple word can't be built from a root that only lives inside compounds
    return !dictionary.hasFlag(entryId, dictionary.onlyincompound);
  }

  if (context == WordContext.COMPOUND_RULE_END) {
    return true;
  }

  char contextFlag = context.requiredFlag(dictionary);
  if (dictionary.hasFlag(entryId, contextFlag) || isFlagAppendedByAffix(lastAffix, contextFlag)) {
    return true;
  }
  return dictionary.hasFlag(entryId, dictionary.compoundFlag)
      || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length);

View File

@ -17,15 +17,35 @@
package org.apache.lucene.analysis.hunspell;
enum WordContext {
/** non-compound */
SIMPLE_WORD,
/** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_BEGIN,
/** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_MIDDLE,
COMPOUND_END;
/** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_END,
/**
* The final root in a word with COMPOUNDRULE compounding. The difference to {@link #COMPOUND_END}
* is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
*/
COMPOUND_RULE_END;
/** @return {@code true} for every context except {@link #SIMPLE_WORD} */
boolean isCompound() {
  return this != WordContext.SIMPLE_WORD;
}
/**
 * Tells whether an affix of the given kind is acceptable in this context as is: a prefix only in
 * {@link #COMPOUND_BEGIN}, a suffix only in {@link #COMPOUND_END} or {@link #COMPOUND_RULE_END}.
 *
 * @param isPrefix {@code true} when asking about a prefix, {@code false} for a suffix
 * @return whether the affix is allowed here without a special permit
 */
boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
  return isPrefix
      ? this == WordContext.COMPOUND_BEGIN
      : (this == WordContext.COMPOUND_END || this == WordContext.COMPOUND_RULE_END);
}
char requiredFlag(Dictionary dictionary) {
switch (this) {
case COMPOUND_BEGIN:

View File

@ -5,3 +5,6 @@ ONLYINCOMPOUND c
COMPOUNDRULE 2
COMPOUNDRULE n*1t
COMPOUNDRULE n*mp
SFX S Y 1
SFX S 0 s

View File

@ -9,7 +9,7 @@
7/nm
8/nm
9/nm
0th/pt
0th/ptS
1st/p
1th/tc
2nd/p

View File

@ -28,4 +28,5 @@
10001st
10011th
1ST
42ND
42ND
10ths