LUCENE-9699: Support German-like compound words (#2248)

This commit is contained in:
Peter Gromov 2021-01-27 22:31:58 +01:00 committed by GitHub
parent 38ec2602ce
commit a176308aa6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 329 additions and 31 deletions

View File

@ -141,8 +141,9 @@ public class Dictionary {
char keepcase; char keepcase;
char needaffix; char needaffix;
char forbiddenword; char forbiddenword;
char onlyincompound; char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
int compoundMin = 3; boolean checkCompoundCase;
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
List<CompoundRule> compoundRules; // nullable List<CompoundRule> compoundRules; // nullable
// ignored characters (dictionary, affix, inputs) // ignored characters (dictionary, affix, inputs)
@ -377,8 +378,20 @@ public class Dictionary {
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line)); forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIN".equals(firstWord)) { } else if ("COMPOUNDMIN".equals(firstWord)) {
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line))); compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDWORDMAX".equals(firstWord)) {
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDRULE".equals(firstWord)) { } else if ("COMPOUNDRULE".equals(firstWord)) {
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line))); compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDEND".equals(firstWord)) {
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
checkCompoundCase = true;
} }
} }
@ -1303,10 +1316,6 @@ public class Dictionary {
} }
} }
boolean hasCompounding() {
return compoundRules != null;
}
boolean hasFlag(int entryId, char flag, BytesRef scratch) { boolean hasFlag(int entryId, char flag, BytesRef scratch) {
return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag); return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag);
} }

View File

@ -87,18 +87,54 @@ public class SpellChecker {
return false; return false;
} }
if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) { if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
return true; return true;
} }
if (dictionary.hasCompounding()) { if (dictionary.compoundRules != null
return checkCompounds(wordChars, 0, length, new ArrayList<>()); && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true;
}
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
}
/**
 * Tells whether the stemmer produces at least one stem for the given slice of {@code chars}.
 *
 * @param chars the buffer holding the word (or compound part) to analyze
 * @param offset index in {@code chars} where the analyzed slice starts
 * @param length number of characters in the analyzed slice
 * @param caseVariant passed through unchanged to {@code stemmer.doStem}
 * @param context whether the slice is a simple word or a specific part of a compound
 * @return {@code true} if {@code stemmer.doStem} returns a non-empty list
 */
private boolean hasStems(
    char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
  List<CharsRef> foundStems = stemmer.doStem(chars, offset, length, caseVariant, context);
  return !foundStems.isEmpty();
}
/**
 * Recursively tries to split the slice {@code chars[offset, offset + length)} into compound parts
 * that each have stems in the matching compound context.
 *
 * @param chars the buffer holding the word being checked
 * @param offset index where the not-yet-split remainder starts
 * @param length number of characters in the remainder
 * @param caseVariant passed through unchanged to {@code hasStems}
 * @param depth number of parts already split off to the left; 0 for the initial call
 * @return {@code true} if some split of the remainder is accepted
 */
private boolean checkCompounds(
char[] chars, int offset, int length, boolean caseVariant, int depth) {
// a split at this depth yields at least depth + 2 parts, so stop once that exceeds compoundMax
if (depth > dictionary.compoundMax - 2) return false;
// both sides of a break must be at least compoundMin characters long
int limit = length - dictionary.compoundMin + 1;
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
// the left part is the compound's beginning only on the outermost call, otherwise a middle part
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
int breakOffset = offset + breakPos;
if (checkCompoundCase(chars, breakOffset)
&& hasStems(chars, offset, breakPos, caseVariant, context)) {
int remainingLength = length - breakPos;
// first try to treat everything after the break as the final compound part
if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
return true;
}
// otherwise try to split the remainder further
if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
return true;
}
}
} }
return false; return false;
} }
private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) { private boolean checkCompoundCase(char[] chars, int breakPos) {
if (!dictionary.checkCompoundCase) return true;
return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
}
private boolean checkCompoundRules(
char[] wordChars, int offset, int length, List<IntsRef> words) {
if (words.size() >= 100) return false; if (words.size() >= 100) return false;
int limit = length - dictionary.compoundMin + 1; int limit = length - dictionary.compoundMin + 1;
@ -113,7 +149,7 @@ public class SpellChecker {
return true; return true;
} }
if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) { if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
return true; return true;
} }
} }
@ -132,8 +168,7 @@ public class SpellChecker {
words.add(forms); words.add(forms);
boolean result = boolean result =
dictionary.compoundRules != null dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
&& dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
words.remove(words.size() - 1); words.remove(words.size() - 1);
return result; return result;
} }

View File

@ -94,18 +94,18 @@ final class Stemmer {
} }
WordCase wordCase = caseOf(word, length); WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, length, false); List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length); caseFoldTitle(word, length);
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length); char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null) { if (aposCase != null) {
list.addAll(doStem(aposCase, length, true)); list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
} }
list.addAll(doStem(titleBuffer, length, true)); list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
} }
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length); caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, length, true)); list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
} }
return list; return list;
} }
@ -158,9 +158,10 @@ final class Stemmer {
return null; return null;
} }
List<CharsRef> doStem(char[] word, int length, boolean caseVariant) { List<CharsRef> doStem(
char[] word, int offset, int length, boolean caseVariant, WordContext context) {
List<CharsRef> stems = new ArrayList<>(); List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length); IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) { if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) { for (int i = 0; i < forms.length; i += formStep) {
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch); char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@ -172,15 +173,32 @@ final class Stemmer {
continue; continue;
} }
// we can't add this form, it only belongs inside a compound word // we can't add this form, it only belongs inside a compound word
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) { if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
continue; continue;
} }
stems.add(newStem(word, 0, length, forms, i)); if (context.isCompound()
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
continue;
}
stems.add(newStem(word, offset, length, forms, i));
} }
} }
try { try {
stems.addAll( stems.addAll(
stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant)); stem(
word,
offset,
length,
context,
-1,
(char) 0,
-1,
0,
true,
true,
false,
false,
caseVariant));
} catch (IOException bogus) { } catch (IOException bogus) {
throw new RuntimeException(bogus); throw new RuntimeException(bogus);
} }
@ -287,6 +305,7 @@ final class Stemmer {
char[] word, char[] word,
int offset, int offset,
int length, int length,
WordContext context,
int previous, int previous,
char prevFlag, char prevFlag,
int prefixId, int prefixId,
@ -328,7 +347,7 @@ final class Stemmer {
continue; continue;
} }
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) { if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true); char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) { if (strippedWord == null) {
continue; continue;
@ -340,6 +359,7 @@ final class Stemmer {
strippedWord, strippedWord,
pureAffix ? offset + i : 0, pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length, pureAffix ? length - i : strippedWord.length,
context,
prefix, prefix,
-1, -1,
recursionDepth, recursionDepth,
@ -378,7 +398,8 @@ final class Stemmer {
continue; continue;
} }
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) { if (isAffixCompatible(
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false); char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) { if (strippedWord == null) {
continue; continue;
@ -390,6 +411,7 @@ final class Stemmer {
strippedWord, strippedWord,
pureAffix ? offset : 0, pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length, pureAffix ? i : strippedWord.length,
context,
suffix, suffix,
prefixId, prefixId,
recursionDepth, recursionDepth,
@ -442,18 +464,31 @@ final class Stemmer {
} }
private boolean isAffixCompatible( private boolean isAffixCompatible(
int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) { int affix,
char prevFlag,
int recursionDepth,
boolean isPrefix,
boolean previousWasPrefix,
WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND); int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
if (context.isCompound() && dictionary.compoundPermit > 0) {
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
return false;
}
}
if (recursionDepth == 0) { if (recursionDepth == 0) {
// check if affix is allowed in a non-compound word // check if affix is allowed in a non-compound word
return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch); return context.isCompound()
|| !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
} }
if (isCrossProduct(affix)) { if (isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list. // cross check incoming continuation class (flag of previous affix) against list.
char[] appendFlags = dictionary.decodeFlags(append, scratch); char[] appendFlags = dictionary.decodeFlags(append, scratch);
if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) { if (context.isCompound() || !Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag); return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
} }
} }
@ -491,8 +526,9 @@ final class Stemmer {
/** /**
* Applies the affix rule to the given word, producing a list of stems if any are found * Applies the affix rule to the given word, producing a list of stems if any are found
* *
* @param strippedWord Word the affix has been removed and the strip added * @param strippedWord Char array containing the word with the affix removed and the strip added
* @param length valid length of stripped word * @param offset where the word actually starts in the array
* @param length the length of the stripped word
* @param affix HunspellAffix representing the affix rule itself * @param affix HunspellAffix representing the affix rule itself
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* suffix, unless both are compatible so we must check dictionary form against both to add it * suffix, unless both are compatible so we must check dictionary form against both to add it
@ -505,6 +541,7 @@ final class Stemmer {
char[] strippedWord, char[] strippedWord,
int offset, int offset,
int length, int length,
WordContext context,
int affix, int affix,
int prefixId, int prefixId,
int recursionDepth, int recursionDepth,
@ -546,10 +583,15 @@ final class Stemmer {
if (!acceptCase(caseVariant, wordFlags)) { if (!acceptCase(caseVariant, wordFlags)) {
continue; continue;
} }
// we aren't decompounding (yet) if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
continue; continue;
} }
if (context.isCompound()) {
char cFlag = context.requiredFlag(dictionary);
if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
continue;
}
}
stems.add(newStem(strippedWord, offset, length, forms, i)); stems.add(newStem(strippedWord, offset, length, forms, i));
} }
} }
@ -594,6 +636,7 @@ final class Stemmer {
strippedWord, strippedWord,
offset, offset,
length, length,
context,
affix, affix,
flag, flag,
prefixId, prefixId,

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
/**
 * Describes how a word (or a part of a word) is being analyzed: as a stand-alone word, or as the
 * beginning, middle or end part of a compound.
 */
enum WordContext {
  SIMPLE_WORD,
  COMPOUND_BEGIN,
  COMPOUND_MIDDLE,
  COMPOUND_END;

  /** Returns {@code true} for every constant except {@link #SIMPLE_WORD}. */
  boolean isCompound() {
    return !equals(SIMPLE_WORD);
  }

  /**
   * Picks the dictionary flag that corresponds to this compound position.
   *
   * @param dictionary the dictionary whose {@code compoundBegin}, {@code compoundMiddle} or {@code
   *     compoundEnd} field is read
   * @return the matching compound flag, or {@code Dictionary.FLAG_UNSET} for {@link #SIMPLE_WORD}
   */
  char requiredFlag(Dictionary dictionary) {
    if (this == COMPOUND_BEGIN) {
      return dictionary.compoundBegin;
    }
    if (this == COMPOUND_MIDDLE) {
      return dictionary.compoundMiddle;
    }
    if (this == COMPOUND_END) {
      return dictionary.compoundEnd;
    }
    // SIMPLE_WORD: no compound flag applies
    return Dictionary.FLAG_UNSET;
  }
}

View File

@ -80,6 +80,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("compoundrule8"); doTest("compoundrule8");
} }
public void testGermanCompounding() throws Exception {
doTest("germancompounding");
}
protected void doTest(String name) throws Exception { protected void doTest(String name) throws Exception {
InputStream affixStream = InputStream affixStream =
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);

View File

@ -0,0 +1,91 @@
# German compounding
# handle special casing of German sharp s
CHECKSHARPS
# compound flags
COMPOUNDBEGIN U
COMPOUNDMIDDLE V
COMPOUNDEND W
# Prefixes are allowed at the beginning of compounds,
# suffixes are allowed at the end of compounds by default:
# (prefix)?(root)+(affix)?
# Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
COMPOUNDPERMITFLAG P
# for German linking morphemes (Fugenelemente)
# Hint: ONLYINCOMPOUND is not required everywhere, but the
# checking will be a little faster with it.
ONLYINCOMPOUND X
# forbid uppercase characters at compound word bounds
CHECKCOMPOUNDCASE
# for handling Fuge-elements with dashes (Arbeits-)
# dash will be a special word
COMPOUNDMIN 1
WORDCHARS -
# compound settings and linking morpheme (Fugenelement) for `Arbeit`
SFX A Y 3
SFX A 0 s/UPX .
SFX A 0 s/VPDX .
SFX A 0 0/WXD .
SFX B Y 2
SFX B 0 0/UPX .
SFX B 0 0/VWXDP .
# a suffix for `Computer'
SFX C Y 1
SFX C 0 n/WD .
# to forbid exceptions (*Arbeitsnehmer)
FORBIDDENWORD Z
# dash prefix for compounds with dash (Arbeits-Computer)
PFX - Y 1
PFX - 0 -/P .
# decapitalizing prefix
# circumfix for positioning in compounds
PFX D Y 29
PFX D A a/PX A
PFX D Ä ä/PX Ä
PFX D B b/PX B
PFX D C c/PX C
PFX D D d/PX D
PFX D E e/PX E
PFX D F f/PX F
PFX D G g/PX G
PFX D H h/PX H
PFX D I i/PX I
PFX D J j/PX J
PFX D K k/PX K
PFX D L l/PX L
PFX D M m/PX M
PFX D N n/PX N
PFX D O o/PX O
PFX D Ö ö/PX Ö
PFX D P p/PX P
PFX D Q q/PX Q
PFX D R r/PX R
PFX D S s/PX S
PFX D T t/PX T
PFX D U u/PX U
PFX D Ü ü/PX Ü
PFX D V v/PX V
PFX D W w/PX W
PFX D X x/PX X
PFX D Y y/PX Y
PFX D Z z/PX Z

View File

@ -0,0 +1,5 @@
4
Arbeit/A-
Computer/BC-
-/W
Arbeitsnehmer/Z

View File

@ -0,0 +1,20 @@
Computer
Computern
Arbeit
Arbeits-
Computerarbeit
Computerarbeits-
Arbeitscomputer
Computercomputer
Computercomputern
Arbeitscomputern
Computerarbeitscomputer
Computerarbeitscomputern
Arbeitscomputercomputer
Computercomputerarbeit
Arbeitscomputerarbeit
Arbeitsarbeitsarbeit
Computerarbeitsarbeit
Computerarbeits-Computer
Computerarbeits-Computern
Computer-Arbeit

View File

@ -0,0 +1,50 @@
computer
computern
arbeit
Arbeits
arbeits
ComputerArbeit
ComputernArbeit
Computernarbeit
ComputerArbeits
Arbeitcomputer
Arbeitcomputern
ArbeitsComputer
ArbeitsComputern
Computerarbeitcomputer
ComputerArbeitcomputer
ComputerArbeitscomputer
Computerarbeitcomputern
ComputerArbeitcomputern
ComputerArbeitscomputern
Arbeitscomputerarbeits
Arbeitscomputernarbeits
Computerarbeits-computer
Arbeitsnehmer
computers
computern
computernarbeit
computernArbeit
computerArbeit
computerArbeits
arbeitcomputer
arbeitsComputer
computerarbeitcomputer
computerArbeitcomputer
computerArbeitscomputer
arbeitscomputerarbeits
computerarbeits-computer
arbeitsnehmer
computernarbeit
computernArbeit
arbeits-
computerarbeit
computerarbeits-
arbeitscomputer
arbeitscomputern
computerarbeitscomputer
computerarbeitscomputern
computerarbeitscomputers
arbeitscomputerarbeit
computerarbeits-Computer
computerarbeits-Computern