LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compound (#2397)

This commit is contained in:
Peter Gromov 2021-02-19 20:03:34 +01:00 committed by GitHub
parent e7c80f6445
commit 4b3fb1e065
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 65 additions and 39 deletions

View File

@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList; import java.util.ArrayList;
@ -397,8 +398,7 @@ public class Hunspell {
if (forms != null) { if (forms != null) {
words.add(forms); words.add(forms);
if (dictionary.compoundRules != null if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
return true; return true;
} }
@ -417,13 +417,17 @@ public class Hunspell {
private boolean checkLastCompoundPart( private boolean checkLastCompoundPart(
char[] wordChars, int start, int length, List<IntsRef> words) { char[] wordChars, int start, int length, List<IntsRef> words) {
IntsRef forms = dictionary.lookupWord(wordChars, start, length); IntsRef ref = new IntsRef(new int[1], 0, 1);
if (forms == null) return false; words.add(ref);
words.add(forms); Stemmer.RootProcessor stopOnMatching =
boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words)); (stem, formID, morphDataId) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};
boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
words.remove(words.size() - 1); words.remove(words.size() - 1);
return result; return found;
} }
private static boolean isNumber(String s) { private static boolean isNumber(String s) {

View File

@ -247,19 +247,12 @@ final class Stemmer {
if (dictionary.hasFlag(entryId, dictionary.needaffix)) { if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue; continue;
} }
// we can't add this form, it only belongs inside a compound word if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) { && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
continue; return false;
} }
if (context.isCompound()) { if (!isRootCompatibleWithContext(context, -1, entryId)) {
if (context != WordContext.COMPOUND_END continue;
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
continue;
}
} }
if (!callProcessor(word, offset, length, processor, forms, i)) { if (!callProcessor(word, offset, length, processor, forms, i)) {
return false; return false;
@ -540,8 +533,8 @@ final class Stemmer {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) { if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false; return false;
} }
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END; if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) { && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
return false; return false;
} }
if (context == WordContext.COMPOUND_END if (context == WordContext.COMPOUND_END
@ -550,18 +543,17 @@ final class Stemmer {
&& dictionary.hasFlag(append, dictionary.onlyincompound)) { && dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false; return false;
} }
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
} }
if (recursionDepth == 0) { if (recursionDepth == 0) {
// check if affix is allowed in a non-compound word return true;
return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
} }
if (dictionary.isCrossProduct(affix)) { if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list. // cross check incoming continuation class (flag of previous affix) against list.
if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) { return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
} }
return false; return false;
@ -640,18 +632,10 @@ final class Stemmer {
} }
} }
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) { if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue; continue;
} }
if (context.isCompound()) {
char cFlag = context.requiredFlag(dictionary);
if (!dictionary.hasFlag(entryId, cFlag)
&& !isFlagAppendedByAffix(affix, cFlag)
&& !dictionary.hasFlag(entryId, dictionary.compoundFlag)
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
continue;
}
}
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) { if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false; return false;
} }
@ -704,6 +688,20 @@ final class Stemmer {
return true; return true;
} }
/**
 * Tells whether the dictionary entry {@code entryId} is acceptable as a root in {@code context}.
 *
 * <p>Outside of compounds, an entry is rejected only when it carries the ONLYINCOMPOUND flag. In
 * {@link WordContext#COMPOUND_RULE_END} every entry is accepted. In any other compound context
 * the entry is accepted when either the flag returned by {@link WordContext#requiredFlag} or the
 * dictionary's generic compound flag is present on the entry itself or is reported by {@code
 * isFlagAppendedByAffix} for {@code lastAffix}.
 *
 * @param context the position of the root being checked
 * @param lastAffix affix id forwarded to {@code isFlagAppendedByAffix}
 * @param entryId dictionary entry whose flags are inspected
 * @return {@code true} if the entry may be used as a root in this context
 */
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
  if (!context.isCompound()) {
    // simple word: the only restriction is ONLYINCOMPOUND
    return !dictionary.hasFlag(entryId, dictionary.onlyincompound);
  }
  if (context == WordContext.COMPOUND_RULE_END) {
    // no flag requirements are imposed on the root in this context
    return true;
  }

  char positionFlag = context.requiredFlag(dictionary);
  if (dictionary.hasFlag(entryId, positionFlag)
      || isFlagAppendedByAffix(lastAffix, positionFlag)) {
    return true;
  }
  return dictionary.hasFlag(entryId, dictionary.compoundFlag)
      || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
private boolean callProcessor( private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) { char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length); CharsRef stem = new CharsRef(word, offset, length);

View File

@ -17,15 +17,35 @@
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
enum WordContext { enum WordContext {
/** non-compound */
SIMPLE_WORD, SIMPLE_WORD,
/** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_BEGIN, COMPOUND_BEGIN,
/** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_MIDDLE, COMPOUND_MIDDLE,
COMPOUND_END;
/** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
COMPOUND_END,
/**
* The final root in a word with COMPOUNDRULE compounding. It differs from {@link #COMPOUND_END}
* in that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
*/
COMPOUND_RULE_END;
boolean isCompound() { boolean isCompound() {
return this != SIMPLE_WORD; return this != SIMPLE_WORD;
} }
/**
 * Tells whether this context accepts an affix of the given kind without a special permit.
 *
 * @param isPrefix {@code true} when asking about a prefix, {@code false} otherwise
 * @return for a prefix, {@code true} only in {@link #COMPOUND_BEGIN}; otherwise {@code true} only
 *     in {@link #COMPOUND_END} or {@link #COMPOUND_RULE_END}
 */
boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
  return isPrefix
      ? this == COMPOUND_BEGIN
      : this == COMPOUND_END || this == COMPOUND_RULE_END;
}
char requiredFlag(Dictionary dictionary) { char requiredFlag(Dictionary dictionary) {
switch (this) { switch (this) {
case COMPOUND_BEGIN: case COMPOUND_BEGIN:

View File

@ -5,3 +5,6 @@ ONLYINCOMPOUND c
COMPOUNDRULE 2 COMPOUNDRULE 2
COMPOUNDRULE n*1t COMPOUNDRULE n*1t
COMPOUNDRULE n*mp COMPOUNDRULE n*mp
SFX S Y 1
SFX S 0 s

View File

@ -9,7 +9,7 @@
7/nm 7/nm
8/nm 8/nm
9/nm 9/nm
0th/pt 0th/ptS
1st/p 1st/p
1th/tc 1th/tc
2nd/p 2nd/p

View File

@ -28,4 +28,5 @@
10001st 10001st
10011th 10011th
1ST 1ST
42ND 42ND
10ths