diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index ea486da3b34..1f1f9ba77d5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; +import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END; import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; import java.util.ArrayList; @@ -397,8 +398,7 @@ public class Hunspell { if (forms != null) { words.add(forms); - if (dictionary.compoundRules != null - && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) { + if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) { if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { return true; } @@ -417,13 +417,17 @@ public class Hunspell { private boolean checkLastCompoundPart( char[] wordChars, int start, int length, List words) { - IntsRef forms = dictionary.lookupWord(wordChars, start, length); - if (forms == null) return false; + IntsRef ref = new IntsRef(new int[1], 0, 1); + words.add(ref); - words.add(forms); - boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words)); + Stemmer.RootProcessor stopOnMatching = + (stem, formID, morphDataId) -> { + ref.ints[0] = formID; + return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words)); + }; + boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching); words.remove(words.size() - 1); - return result; + return found; } private static boolean isNumber(String s) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index b84050e4711..41648925951 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -247,19 +247,12 @@ final class Stemmer { if (dictionary.hasFlag(entryId, dictionary.needaffix)) { continue; } - // we can't add this form, it only belongs inside a compound word - if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) { - continue; + if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE) + && dictionary.hasFlag(entryId, dictionary.compoundForbid)) { + return false; } - if (context.isCompound()) { - if (context != WordContext.COMPOUND_END - && dictionary.hasFlag(entryId, dictionary.compoundForbid)) { - return false; - } - if (!dictionary.hasFlag(entryId, dictionary.compoundFlag) - && !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) { - continue; - } + if (!isRootCompatibleWithContext(context, -1, entryId)) { + continue; } if (!callProcessor(word, offset, length, processor, forms, i)) { return false; @@ -540,8 +533,8 @@ final class Stemmer { if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) { return false; } - WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END; - if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) { + if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix) + && !dictionary.hasFlag(append, dictionary.compoundPermit)) { return false; } if (context == WordContext.COMPOUND_END @@ -550,18 +543,17 @@ final class Stemmer { && dictionary.hasFlag(append, dictionary.onlyincompound)) { return false; } + } else if (dictionary.hasFlag(append, dictionary.onlyincompound)) { + return false; } if (recursionDepth == 0) { - // check if affix is allowed in a non-compound word - return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound); + return true; } if (dictionary.isCrossProduct(affix)) { // cross check incoming continuation class (flag of previous affix) against list. - if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) { - return previousWasPrefix || dictionary.hasFlag(append, prevFlag); - } + return previousWasPrefix || dictionary.hasFlag(append, prevFlag); } return false; @@ -640,18 +632,10 @@ final class Stemmer { } } - if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) { + if (!isRootCompatibleWithContext(context, affix, entryId)) { continue; } - if (context.isCompound()) { - char cFlag = context.requiredFlag(dictionary); - if (!dictionary.hasFlag(entryId, cFlag) - && !isFlagAppendedByAffix(affix, cFlag) - && !dictionary.hasFlag(entryId, dictionary.compoundFlag) - && !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) { - continue; - } - } + if (!callProcessor(strippedWord, offset, length, processor, forms, i)) { return false; } @@ -704,6 +688,20 @@ final class Stemmer { return true; } + private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) { + if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) { + return false; + } + if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) { + char cFlag = context.requiredFlag(dictionary); + return dictionary.hasFlag(entryId, cFlag) + || isFlagAppendedByAffix(lastAffix, cFlag) + || dictionary.hasFlag(entryId, dictionary.compoundFlag) + || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag); + } + return true; + } + private boolean callProcessor( char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) { CharsRef stem = new CharsRef(word, offset, length); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java index 4dd6e0e9928..6c7b159010a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java @@ -17,15 +17,35 @@ package org.apache.lucene.analysis.hunspell; enum WordContext { + /** non-compound */ SIMPLE_WORD, + + /** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */ COMPOUND_BEGIN, + + /** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */ COMPOUND_MIDDLE, - COMPOUND_END; + + /** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */ + COMPOUND_END, + + /** + * The final root in a word with COMPOUNDRULE compounding. The difference to {@link #COMPOUND_END} + * is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND. + */ + COMPOUND_RULE_END; boolean isCompound() { return this != SIMPLE_WORD; } + boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) { + if (isPrefix) { + return this == WordContext.COMPOUND_BEGIN; + } + return this == WordContext.COMPOUND_END || this == WordContext.COMPOUND_RULE_END; + } + char requiredFlag(Dictionary dictionary) { switch (this) { case COMPOUND_BEGIN: diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff index 8a9996cb3e2..4fc04833473 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff @@ -5,3 +5,6 @@ ONLYINCOMPOUND c COMPOUNDRULE 2 COMPOUNDRULE n*1t COMPOUNDRULE n*mp + +SFX S Y 1 +SFX S 0 s \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic index ced0735ec1e..5eeebdd9618 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic @@ -9,7 +9,7 @@ 7/nm 8/nm 9/nm -0th/pt +0th/ptS 1st/p 1th/tc 2nd/p diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good index 86949437d38..3aee03b3f1e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good @@ -28,4 +28,5 @@ 10001st 10011th 1ST -42ND \ No newline at end of file +42ND +10ths \ No newline at end of file