diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 5b674e22b02..d9473a9c681 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -141,8 +141,9 @@ public class Dictionary { char keepcase; char needaffix; char forbiddenword; - char onlyincompound; - int compoundMin = 3; + char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit; + boolean checkCompoundCase; + int compoundMin = 3, compoundMax = Integer.MAX_VALUE; List compoundRules; // nullable // ignored characters (dictionary, affix, inputs) @@ -377,8 +378,20 @@ public class Dictionary { forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line)); } else if ("COMPOUNDMIN".equals(firstWord)) { compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line))); + } else if ("COMPOUNDWORDMAX".equals(firstWord)) { + compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line))); } else if ("COMPOUNDRULE".equals(firstWord)) { compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line))); + } else if ("COMPOUNDBEGIN".equals(firstWord)) { + compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line)); + } else if ("COMPOUNDMIDDLE".equals(firstWord)) { + compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line)); + } else if ("COMPOUNDEND".equals(firstWord)) { + compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line)); + } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) { + compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line)); + } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) { + checkCompoundCase = true; } } @@ -1303,10 +1316,6 @@ public class Dictionary { } } - boolean hasCompounding() { - 
return compoundRules != null; - } - boolean hasFlag(int entryId, char flag, BytesRef scratch) { return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index b9f29a397a2..4056db6ea16 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -87,18 +87,54 @@ public class SpellChecker { return false; } - if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) { + if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) { return true; } - if (dictionary.hasCompounding()) { - return checkCompounds(wordChars, 0, length, new ArrayList<>()); + if (dictionary.compoundRules != null + && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { + return true; + } + + return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0); + } + + private boolean hasStems( + char[] chars, int offset, int length, boolean caseVariant, WordContext context) { + return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty(); + } + + private boolean checkCompounds( + char[] chars, int offset, int length, boolean caseVariant, int depth) { + if (depth > dictionary.compoundMax - 2) return false; + + int limit = length - dictionary.compoundMin + 1; + for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { + WordContext context = depth == 0 ? 
WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE; + int breakOffset = offset + breakPos; + if (checkCompoundCase(chars, breakOffset) + && hasStems(chars, offset, breakPos, caseVariant, context)) { + int remainingLength = length - breakPos; + if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) { + return true; + } + + if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) { + return true; + } + } } return false; } - private boolean checkCompounds(char[] wordChars, int offset, int length, List words) { + /** + * Applies the CHECKCOMPOUNDCASE restriction to a potential compound boundary: when the option is + * enabled, neither character adjacent to the boundary may be upper case, unless one of them is a + * dash. + * + * @param chars the characters of the word being checked + * @param breakPos index in {@code chars} of the first character after the boundary, at least 1 + * @return whether the case rules allow splitting the word right before {@code breakPos} + */ + private boolean checkCompoundCase(char[] chars, int breakPos) { + if (!dictionary.checkCompoundCase) return true; + + char before = chars[breakPos - 1]; + char after = chars[breakPos]; + if (before == '-' || after == '-') return true; + return !Character.isUpperCase(before) && !Character.isUpperCase(after); + } + + private boolean checkCompoundRules( + char[] wordChars, int offset, int length, List words) { if (words.size() >= 100) return false; int limit = length - dictionary.compoundMin + 1; @@ -113,7 +149,7 @@ public class SpellChecker { return true; } - if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) { + if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) { return true; } } @@ -132,8 +168,7 @@ public class SpellChecker { words.add(forms); boolean result = - dictionary.compoundRules != null - && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch)); + dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch)); words.remove(words.size() - 1); return result; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 4a337fbce57..6b6fb80503a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -94,18 +94,18 @@ final class Stemmer { } 
WordCase wordCase = caseOf(word, length); - List list = doStem(word, length, false); + List list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD); if (wordCase == WordCase.UPPER) { caseFoldTitle(word, length); char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length); if (aposCase != null) { - list.addAll(doStem(aposCase, length, true)); + list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD)); } - list.addAll(doStem(titleBuffer, length, true)); + list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD)); } if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length); - list.addAll(doStem(lowerBuffer, length, true)); + list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD)); } return list; } @@ -158,9 +158,10 @@ final class Stemmer { return null; } - List doStem(char[] word, int length, boolean caseVariant) { + List doStem( + char[] word, int offset, int length, boolean caseVariant, WordContext context) { List stems = new ArrayList<>(); - IntsRef forms = dictionary.lookupWord(word, 0, length); + IntsRef forms = dictionary.lookupWord(word, offset, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch); @@ -172,15 +173,32 @@ final class Stemmer { continue; } // we can't add this form, it only belongs inside a compound word - if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) { + if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) { continue; } - stems.add(newStem(word, 0, length, forms, i)); + if (context.isCompound() + && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) { + continue; + } + stems.add(newStem(word, offset, length, forms, i)); } } try { stems.addAll( - stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant)); + stem( + 
word, + offset, + length, + context, + -1, + (char) 0, + -1, + 0, + true, + true, + false, + false, + caseVariant)); } catch (IOException bogus) { throw new RuntimeException(bogus); } @@ -287,6 +305,7 @@ final class Stemmer { char[] word, int offset, int length, + WordContext context, int previous, char prevFlag, int prefixId, @@ -328,7 +347,7 @@ final class Stemmer { continue; } - if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) { + if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) { char[] strippedWord = stripAffix(word, offset, length, i, prefix, true); if (strippedWord == null) { continue; @@ -340,6 +359,7 @@ final class Stemmer { strippedWord, pureAffix ? offset + i : 0, pureAffix ? length - i : strippedWord.length, + context, prefix, -1, recursionDepth, @@ -378,7 +398,8 @@ final class Stemmer { continue; } - if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) { + if (isAffixCompatible( + suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) { char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false); if (strippedWord == null) { continue; @@ -390,6 +411,7 @@ final class Stemmer { strippedWord, pureAffix ? offset : 0, pureAffix ? i : strippedWord.length, + context, suffix, prefixId, recursionDepth, @@ -442,18 +464,31 @@ final class Stemmer { } private boolean isAffixCompatible( - int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) { + int affix, + char prevFlag, + int recursionDepth, + boolean isPrefix, + boolean previousWasPrefix, + WordContext context) { int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND); + if (context.isCompound() && dictionary.compoundPermit > 0) { + WordContext allowed = isPrefix ? 
WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END; + if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) { + return false; + } + } + if (recursionDepth == 0) { // check if affix is allowed in a non-compound word - return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch); + return context.isCompound() + || !dictionary.hasFlag(append, dictionary.onlyincompound, scratch); } if (isCrossProduct(affix)) { // cross check incoming continuation class (flag of previous affix) against list. char[] appendFlags = dictionary.decodeFlags(append, scratch); - if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) { + if (context.isCompound() || !Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) { return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag); } } @@ -491,8 +526,9 @@ final class Stemmer { /** * Applies the affix rule to the given word, producing a list of stems if any are found * - * @param strippedWord Word the affix has been removed and the strip added - * @param length valid length of stripped word + * @param strippedWord Char array containing the word with the affix removed and the strip added + * @param offset where the word actually starts in the array + * @param length the length of the stripped word * @param affix HunspellAffix representing the affix rule itself * @param prefixId when we already stripped a prefix, we can't simply recurse and check the * suffix, unless both are compatible so we must check dictionary form against both to add it @@ -505,6 +541,7 @@ final class Stemmer { char[] strippedWord, int offset, int length, + WordContext context, int affix, int prefixId, int recursionDepth, @@ -546,10 +583,15 @@ final class Stemmer { if (!acceptCase(caseVariant, wordFlags)) { continue; } - // we aren't decompounding (yet) - if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) { + if (!context.isCompound() && Dictionary.hasFlag(wordFlags, 
dictionary.onlyincompound)) { continue; } + if (context.isCompound()) { + char cFlag = context.requiredFlag(dictionary); + if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) { + continue; + } + } stems.add(newStem(strippedWord, offset, length, forms, i)); } } @@ -594,6 +636,7 @@ final class Stemmer { strippedWord, offset, length, + context, affix, flag, prefixId, diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java new file mode 100644 index 00000000000..4dd6e0e9928 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.hunspell; + +enum WordContext { + SIMPLE_WORD, + COMPOUND_BEGIN, + COMPOUND_MIDDLE, + COMPOUND_END; + + boolean isCompound() { + return this != SIMPLE_WORD; + } + + char requiredFlag(Dictionary dictionary) { + switch (this) { + case COMPOUND_BEGIN: + return dictionary.compoundBegin; + case COMPOUND_MIDDLE: + return dictionary.compoundMiddle; + case COMPOUND_END: + return dictionary.compoundEnd; + default: + return Dictionary.FLAG_UNSET; + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index dacf22e40ed..75b76392226 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -80,6 +80,10 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("compoundrule8"); } + public void testGermanCompounding() throws Exception { + doTest("germancompounding"); + } + protected void doTest(String name) throws Exception { InputStream affixStream = Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff new file mode 100644 index 00000000000..5ff25872ce1 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff @@ -0,0 +1,91 @@ +# German compounding + +# handle special casing of German sharp s + +CHECKSHARPS + +# compound flags + +COMPOUNDBEGIN U +COMPOUNDMIDDLE V +COMPOUNDEND W + +# Prefixes are allowed at the beginning of compounds, +# suffixes are allowed at the end of compounds by default: +# (prefix)?(root)+(affix)? 
+# Affixes with COMPOUNDPERMITFLAG may be inside of compounds. +COMPOUNDPERMITFLAG P + +# for German fogemorphemes (Fuge-element) +# Hint: ONLYINCOMPOUND is not required everywhere, but the +# checking will be a little faster with it. + +ONLYINCOMPOUND X + +# forbid uppercase characters at compound word bounds +CHECKCOMPOUNDCASE + +# for handling Fuge-elements with dashes (Arbeits-) +# dash will be a special word + +COMPOUNDMIN 1 +WORDCHARS - + +# compound settings and fogemorpheme for `Arbeit' + +SFX A Y 3 +SFX A 0 s/UPX . +SFX A 0 s/VPDX . +SFX A 0 0/WXD . + +SFX B Y 2 +SFX B 0 0/UPX . +SFX B 0 0/VWXDP . + +# a suffix for `Computer' + +SFX C Y 1 +SFX C 0 n/WD . + +# for forbid exceptions (*Arbeitsnehmer) + +FORBIDDENWORD Z + +# dash prefix for compounds with dash (Arbeits-Computer) + +PFX - Y 1 +PFX - 0 -/P . + +# decapitalizing prefix +# circumfix for positioning in compounds + +PFX D Y 29 +PFX D A a/PX A +PFX D Ä ä/PX Ä +PFX D B b/PX B +PFX D C c/PX C +PFX D D d/PX D +PFX D E e/PX E +PFX D F f/PX F +PFX D G g/PX G +PFX D H h/PX H +PFX D I i/PX I +PFX D J j/PX J +PFX D K k/PX K +PFX D L l/PX L +PFX D M m/PX M +PFX D N n/PX N +PFX D O o/PX O +PFX D Ö ö/PX Ö +PFX D P p/PX P +PFX D Q q/PX Q +PFX D R r/PX R +PFX D S s/PX S +PFX D T t/PX T +PFX D U u/PX U +PFX D Ü ü/PX Ü +PFX D V v/PX V +PFX D W w/PX W +PFX D X x/PX X +PFX D Y y/PX Y +PFX D Z z/PX Z diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic new file mode 100644 index 00000000000..5db6783a4de --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic @@ -0,0 +1,5 @@ +4 +Arbeit/A- +Computer/BC- +-/W +Arbeitsnehmer/Z diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good new file 
mode 100644 index 00000000000..e4945553c5d --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good @@ -0,0 +1,20 @@ +Computer +Computern +Arbeit +Arbeits- +Computerarbeit +Computerarbeits- +Arbeitscomputer +Computercomputer +Computercomputern +Arbeitscomputern +Computerarbeitscomputer +Computerarbeitscomputern +Arbeitscomputercomputer +Computercomputerarbeit +Arbeitscomputerarbeit +Arbeitsarbeitsarbeit +Computerarbeitsarbeit +Computerarbeits-Computer +Computerarbeits-Computern +Computer-Arbeit diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong new file mode 100644 index 00000000000..c5f2ba11517 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong @@ -0,0 +1,50 @@ +computer +computern +arbeit +Arbeits +arbeits +ComputerArbeit +ComputernArbeit +Computernarbeit +ComputerArbeits +Arbeitcomputer +Arbeitcomputern +ArbeitsComputer +ArbeitsComputern +Computerarbeitcomputer +ComputerArbeitcomputer +ComputerArbeitscomputer +Computerarbeitcomputern +ComputerArbeitcomputern +ComputerArbeitscomputern +Arbeitscomputerarbeits +Arbeitscomputernarbeits +Computerarbeits-computer +Arbeitsnehmer +computers +computern +computernarbeit +computernArbeit +computerArbeit +computerArbeits +arbeitcomputer +arbeitsComputer +computerarbeitcomputer +computerArbeitcomputer +computerArbeitscomputer +arbeitscomputerarbeits +computerarbeits-computer +arbeitsnehmer +computernarbeit +computernArbeit +arbeits- +computerarbeit +computerarbeits- +arbeitscomputer +arbeitscomputern +computerarbeitscomputer +computerarbeitscomputern +computerarbeitscomputers +arbeitscomputerarbeit +computerarbeits-Computer +computerarbeits-Computern