mirror of https://github.com/apache/lucene.git
LUCENE-9699: Support German-like compound words (#2248)
This commit is contained in:
parent
38ec2602ce
commit
a176308aa6
|
@ -141,8 +141,9 @@ public class Dictionary {
|
|||
char keepcase;
|
||||
char needaffix;
|
||||
char forbiddenword;
|
||||
char onlyincompound;
|
||||
int compoundMin = 3;
|
||||
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
|
||||
boolean checkCompoundCase;
|
||||
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
||||
List<CompoundRule> compoundRules; // nullable
|
||||
|
||||
// ignored characters (dictionary, affix, inputs)
|
||||
|
@ -377,8 +378,20 @@ public class Dictionary {
|
|||
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIN".equals(firstWord)) {
|
||||
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDWORDMAX".equals(firstWord)) {
|
||||
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
||||
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
|
||||
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
|
||||
compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDEND".equals(firstWord)) {
|
||||
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
|
||||
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
|
||||
checkCompoundCase = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1303,10 +1316,6 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
/** Returns {@code true} when a list of compound rules is present, i.e. {@code compoundRules} is non-null. */
boolean hasCompounding() {
|
||||
return compoundRules != null;
|
||||
}
|
||||
|
||||
/**
 * Checks whether the flag set stored under {@code entryId} contains {@code flag}.
 *
 * @param entryId identifier passed to {@code decodeFlags} to obtain the flags to inspect
 * @param flag the flag to look for; {@code FLAG_UNSET} never matches
 * @param scratch reusable buffer handed to {@code decodeFlags}
 * @return {@code false} if {@code flag} is {@code FLAG_UNSET}, otherwise whether the decoded flags
 *     contain it
 */
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
|
||||
// short-circuit: an unset flag is never looked up, so nothing is decoded in that case
return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag);
|
||||
}
|
||||
|
|
|
@ -87,18 +87,54 @@ public class SpellChecker {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
|
||||
if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.hasCompounding()) {
|
||||
return checkCompounds(wordChars, 0, length, new ArrayList<>());
|
||||
if (dictionary.compoundRules != null
|
||||
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
|
||||
}
|
||||
|
||||
/**
 * Tells whether the stemmer produces at least one stem for a slice of {@code chars}.
 *
 * @param chars buffer holding the text to check
 * @param offset start of the slice inside {@code chars}
 * @param length number of chars in the slice
 * @param caseVariant forwarded unchanged to {@code stemmer.doStem}
 * @param context position of the slice (simple word or a part of a compound), forwarded to the
 *     stemmer
 * @return {@code true} if the list returned by {@code stemmer.doStem} is not empty
 */
private boolean hasStems(
|
||||
char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
|
||||
return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
|
||||
}
|
||||
|
||||
/**
 * Recursively tries to split {@code chars[offset, offset + length)} into compound parts that all
 * have stems in their respective {@link WordContext}.
 *
 * <p>At every candidate break position the left part is checked as {@code COMPOUND_BEGIN} (when
 * {@code depth == 0}) or {@code COMPOUND_MIDDLE} (otherwise). The right part is first tried as a
 * single {@code COMPOUND_END}; if that fails it is split further by a recursive call.
 *
 * @param chars buffer holding the word
 * @param offset start of the not yet analyzed remainder inside {@code chars}
 * @param length length of that remainder
 * @param caseVariant forwarded unchanged to {@code hasStems}
 * @param depth number of parts already split off to the left of {@code offset}
 * @return {@code true} as soon as one valid split is found, {@code false} otherwise
 */
private boolean checkCompounds(
|
||||
char[] chars, int offset, int length, boolean caseVariant, int depth) {
|
||||
// a successful split at this level yields at least depth + 2 parts; give up if that exceeds
// dictionary.compoundMax
if (depth > dictionary.compoundMax - 2) return false;
|
||||
|
||||
// break positions are chosen so that both sides keep at least dictionary.compoundMin chars
int limit = length - dictionary.compoundMin + 1;
|
||||
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
||||
// absolute index in chars of the first char of the right-hand part
int breakOffset = offset + breakPos;
|
||||
if (checkCompoundCase(chars, breakOffset)
|
||||
&& hasStems(chars, offset, breakPos, caseVariant, context)) {
|
||||
int remainingLength = length - breakPos;
|
||||
// first try the whole remainder as the final part of the compound
if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// otherwise try to split the remainder into further parts
if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
|
||||
/**
 * Applies the compound-case restriction to a candidate break position.
 *
 * @param chars buffer holding the word
 * @param breakPos absolute index in {@code chars} of the first char after the break; the char at
 *     {@code breakPos - 1} is read as well, so it must be at least 1
 * @return {@code true} if {@code dictionary.checkCompoundCase} is off, or if the chars on both
 *     sides of the break agree on {@link Character#isUpperCase(char)}
 */
private boolean checkCompoundCase(char[] chars, int breakPos) {
|
||||
if (!dictionary.checkCompoundCase) return true;
|
||||
// NOTE(review): a boundary with uppercase letters on BOTH sides also passes this comparison;
// confirm that this is the intended meaning of CHECKCOMPOUNDCASE
return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
|
||||
}
|
||||
|
||||
private boolean checkCompoundRules(
|
||||
char[] wordChars, int offset, int length, List<IntsRef> words) {
|
||||
if (words.size() >= 100) return false;
|
||||
|
||||
int limit = length - dictionary.compoundMin + 1;
|
||||
|
@ -113,7 +149,7 @@ public class SpellChecker {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||
if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -132,8 +168,7 @@ public class SpellChecker {
|
|||
|
||||
words.add(forms);
|
||||
boolean result =
|
||||
dictionary.compoundRules != null
|
||||
&& dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
|
||||
dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
|
||||
words.remove(words.size() - 1);
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -94,18 +94,18 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
WordCase wordCase = caseOf(word, length);
|
||||
List<CharsRef> list = doStem(word, length, false);
|
||||
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
caseFoldTitle(word, length);
|
||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||
if (aposCase != null) {
|
||||
list.addAll(doStem(aposCase, length, true));
|
||||
list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
list.addAll(doStem(titleBuffer, length, true));
|
||||
list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||
list.addAll(doStem(lowerBuffer, length, true));
|
||||
list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
@ -158,9 +158,10 @@ final class Stemmer {
|
|||
return null;
|
||||
}
|
||||
|
||||
List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
||||
List<CharsRef> doStem(
|
||||
char[] word, int offset, int length, boolean caseVariant, WordContext context) {
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
IntsRef forms = dictionary.lookupWord(word, 0, length);
|
||||
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
||||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
|
||||
|
@ -172,15 +173,32 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
// we can't add this form, it only belongs inside a compound word
|
||||
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
continue;
|
||||
}
|
||||
stems.add(newStem(word, 0, length, forms, i));
|
||||
if (context.isCompound()
|
||||
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
||||
continue;
|
||||
}
|
||||
stems.add(newStem(word, offset, length, forms, i));
|
||||
}
|
||||
}
|
||||
try {
|
||||
stems.addAll(
|
||||
stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
|
||||
stem(
|
||||
word,
|
||||
offset,
|
||||
length,
|
||||
context,
|
||||
-1,
|
||||
(char) 0,
|
||||
-1,
|
||||
0,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
caseVariant));
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
|
@ -287,6 +305,7 @@ final class Stemmer {
|
|||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
WordContext context,
|
||||
int previous,
|
||||
char prevFlag,
|
||||
int prefixId,
|
||||
|
@ -328,7 +347,7 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
|
||||
if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
|
||||
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
|
||||
if (strippedWord == null) {
|
||||
continue;
|
||||
|
@ -340,6 +359,7 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
pureAffix ? offset + i : 0,
|
||||
pureAffix ? length - i : strippedWord.length,
|
||||
context,
|
||||
prefix,
|
||||
-1,
|
||||
recursionDepth,
|
||||
|
@ -378,7 +398,8 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
|
||||
if (isAffixCompatible(
|
||||
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
|
||||
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
|
||||
if (strippedWord == null) {
|
||||
continue;
|
||||
|
@ -390,6 +411,7 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
pureAffix ? offset : 0,
|
||||
pureAffix ? i : strippedWord.length,
|
||||
context,
|
||||
suffix,
|
||||
prefixId,
|
||||
recursionDepth,
|
||||
|
@ -442,18 +464,31 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
private boolean isAffixCompatible(
|
||||
int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) {
|
||||
int affix,
|
||||
char prevFlag,
|
||||
int recursionDepth,
|
||||
boolean isPrefix,
|
||||
boolean previousWasPrefix,
|
||||
WordContext context) {
|
||||
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
if (context.isCompound() && dictionary.compoundPermit > 0) {
|
||||
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
|
||||
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (recursionDepth == 0) {
|
||||
// check if affix is allowed in a non-compound word
|
||||
return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
|
||||
return context.isCompound()
|
||||
|| !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
|
||||
}
|
||||
|
||||
if (isCrossProduct(affix)) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
char[] appendFlags = dictionary.decodeFlags(append, scratch);
|
||||
if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
|
||||
if (context.isCompound() || !Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
|
||||
return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
|
||||
}
|
||||
}
|
||||
|
@ -491,8 +526,9 @@ final class Stemmer {
|
|||
/**
|
||||
* Applies the affix rule to the given word, producing a list of stems if any are found
|
||||
*
|
||||
* @param strippedWord Word the affix has been removed and the strip added
|
||||
* @param length valid length of stripped word
|
||||
* @param strippedWord Char array containing the word with the affix removed and the strip added
|
||||
* @param offset where the word actually starts in the array
|
||||
* @param length the length of the stripped word
|
||||
* @param affix HunspellAffix representing the affix rule itself
|
||||
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
|
||||
* suffix, unless both are compatible so we must check dictionary form against both to add it
|
||||
|
@ -505,6 +541,7 @@ final class Stemmer {
|
|||
char[] strippedWord,
|
||||
int offset,
|
||||
int length,
|
||||
WordContext context,
|
||||
int affix,
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
|
@ -546,10 +583,15 @@ final class Stemmer {
|
|||
if (!acceptCase(caseVariant, wordFlags)) {
|
||||
continue;
|
||||
}
|
||||
// we aren't decompounding (yet)
|
||||
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
continue;
|
||||
}
|
||||
if (context.isCompound()) {
|
||||
char cFlag = context.requiredFlag(dictionary);
|
||||
if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
stems.add(newStem(strippedWord, offset, length, forms, i));
|
||||
}
|
||||
}
|
||||
|
@ -594,6 +636,7 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
offset,
|
||||
length,
|
||||
context,
|
||||
affix,
|
||||
flag,
|
||||
prefixId,
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
/**
 * The position in which a piece of text is being analyzed: either as a stand-alone word or as the
 * first, an inner, or the last part of a compound word.
 */
enum WordContext {
|
||||
/** The text is checked as a whole word, not as a part of a compound. */
SIMPLE_WORD,
|
||||
/** The text is the first part of a compound. */
COMPOUND_BEGIN,
|
||||
/** The text is a part of a compound that is neither the first nor the last one. */
COMPOUND_MIDDLE,
|
||||
/** The text is the last part of a compound. */
COMPOUND_END;
|
||||
|
||||
/** Returns {@code true} for every constant except {@link #SIMPLE_WORD}. */
boolean isCompound() {
|
||||
return this != SIMPLE_WORD;
|
||||
}
|
||||
|
||||
/**
 * Returns the dictionary flag associated with this compound position.
 *
 * @param dictionary the dictionary whose {@code compoundBegin}, {@code compoundMiddle} or {@code
 *     compoundEnd} field is read
 * @return the matching compound flag, or {@code Dictionary.FLAG_UNSET} for {@link #SIMPLE_WORD}
 */
char requiredFlag(Dictionary dictionary) {
|
||||
switch (this) {
|
||||
case COMPOUND_BEGIN:
|
||||
return dictionary.compoundBegin;
|
||||
case COMPOUND_MIDDLE:
|
||||
return dictionary.compoundMiddle;
|
||||
case COMPOUND_END:
|
||||
return dictionary.compoundEnd;
|
||||
default:
|
||||
// SIMPLE_WORD: there is no compound flag to require
return Dictionary.FLAG_UNSET;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -80,6 +80,10 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("compoundrule8");
|
||||
}
|
||||
|
||||
/** Runs {@code doTest} against the {@code germancompounding} test resources. */
public void testGermanCompounding() throws Exception {
|
||||
doTest("germancompounding");
|
||||
}
|
||||
|
||||
protected void doTest(String name) throws Exception {
|
||||
InputStream affixStream =
|
||||
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
# German compounding
|
||||
|
||||
# handle special casing of German sharp s
|
||||
|
||||
CHECKSHARPS
|
||||
|
||||
# compound flags
|
||||
|
||||
COMPOUNDBEGIN U
|
||||
COMPOUNDMIDDLE V
|
||||
COMPOUNDEND W
|
||||
|
||||
# Prefixes are allowed at the beginning of compounds,
|
||||
# suffixes are allowed at the end of compounds by default:
|
||||
# (prefix)?(root)+(affix)?
|
||||
# Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
|
||||
COMPOUNDPERMITFLAG P
|
||||
|
||||
# for German fogemorphemes (Fuge-element)
|
||||
# Hint: ONLYINCOMPOUND is not required everywhere, but the
|
||||
# checking will be a little faster with it.
|
||||
|
||||
ONLYINCOMPOUND X
|
||||
|
||||
# forbid uppercase characters at compound word bounds
|
||||
CHECKCOMPOUNDCASE
|
||||
|
||||
# for handling Fuge-elements with dashes (Arbeits-)
|
||||
# dash will be a special word
|
||||
|
||||
COMPOUNDMIN 1
|
||||
WORDCHARS -
|
||||
|
||||
# compound settings and fogemorpheme for `Arbeit'
|
||||
|
||||
SFX A Y 3
|
||||
SFX A 0 s/UPX .
|
||||
SFX A 0 s/VPDX .
|
||||
SFX A 0 0/WXD .
|
||||
|
||||
SFX B Y 2
|
||||
SFX B 0 0/UPX .
|
||||
SFX B 0 0/VWXDP .
|
||||
|
||||
# a suffix for `Computer'
|
||||
|
||||
SFX C Y 1
|
||||
SFX C 0 n/WD .
|
||||
|
||||
# for forbid exceptions (*Arbeitsnehmer)
|
||||
|
||||
FORBIDDENWORD Z
|
||||
|
||||
# dash prefix for compounds with dash (Arbeits-Computer)
|
||||
|
||||
PFX - Y 1
|
||||
PFX - 0 -/P .
|
||||
|
||||
# decapitalizing prefix
|
||||
# circumfix for positioning in compounds
|
||||
|
||||
PFX D Y 29
|
||||
PFX D A a/PX A
|
||||
PFX D Ä ä/PX Ä
|
||||
PFX D B b/PX B
|
||||
PFX D C c/PX C
|
||||
PFX D D d/PX D
|
||||
PFX D E e/PX E
|
||||
PFX D F f/PX F
|
||||
PFX D G g/PX G
|
||||
PFX D H h/PX H
|
||||
PFX D I i/PX I
|
||||
PFX D J j/PX J
|
||||
PFX D K k/PX K
|
||||
PFX D L l/PX L
|
||||
PFX D M m/PX M
|
||||
PFX D N n/PX N
|
||||
PFX D O o/PX O
|
||||
PFX D Ö ö/PX Ö
|
||||
PFX D P p/PX P
|
||||
PFX D Q q/PX Q
|
||||
PFX D R r/PX R
|
||||
PFX D S s/PX S
|
||||
PFX D T t/PX T
|
||||
PFX D U u/PX U
|
||||
PFX D Ü ü/PX Ü
|
||||
PFX D V v/PX V
|
||||
PFX D W w/PX W
|
||||
PFX D X x/PX X
|
||||
PFX D Y y/PX Y
|
||||
PFX D Z z/PX Z
|
|
@ -0,0 +1,5 @@
|
|||
4
|
||||
Arbeit/A-
|
||||
Computer/BC-
|
||||
-/W
|
||||
Arbeitsnehmer/Z
|
|
@ -0,0 +1,20 @@
|
|||
Computer
|
||||
Computern
|
||||
Arbeit
|
||||
Arbeits-
|
||||
Computerarbeit
|
||||
Computerarbeits-
|
||||
Arbeitscomputer
|
||||
Computercomputer
|
||||
Computercomputern
|
||||
Arbeitscomputern
|
||||
Computerarbeitscomputer
|
||||
Computerarbeitscomputern
|
||||
Arbeitscomputercomputer
|
||||
Computercomputerarbeit
|
||||
Arbeitscomputerarbeit
|
||||
Arbeitsarbeitsarbeit
|
||||
Computerarbeitsarbeit
|
||||
Computerarbeits-Computer
|
||||
Computerarbeits-Computern
|
||||
Computer-Arbeit
|
|
@ -0,0 +1,50 @@
|
|||
computer
|
||||
computern
|
||||
arbeit
|
||||
Arbeits
|
||||
arbeits
|
||||
ComputerArbeit
|
||||
ComputernArbeit
|
||||
Computernarbeit
|
||||
ComputerArbeits
|
||||
Arbeitcomputer
|
||||
Arbeitcomputern
|
||||
ArbeitsComputer
|
||||
ArbeitsComputern
|
||||
Computerarbeitcomputer
|
||||
ComputerArbeitcomputer
|
||||
ComputerArbeitscomputer
|
||||
Computerarbeitcomputern
|
||||
ComputerArbeitcomputern
|
||||
ComputerArbeitscomputern
|
||||
Arbeitscomputerarbeits
|
||||
Arbeitscomputernarbeits
|
||||
Computerarbeits-computer
|
||||
Arbeitsnehmer
|
||||
computers
|
||||
computern
|
||||
computernarbeit
|
||||
computernArbeit
|
||||
computerArbeit
|
||||
computerArbeits
|
||||
arbeitcomputer
|
||||
arbeitsComputer
|
||||
computerarbeitcomputer
|
||||
computerArbeitcomputer
|
||||
computerArbeitscomputer
|
||||
arbeitscomputerarbeits
|
||||
computerarbeits-computer
|
||||
arbeitsnehmer
|
||||
computernarbeit
|
||||
computernArbeit
|
||||
arbeits-
|
||||
computerarbeit
|
||||
computerarbeits-
|
||||
arbeitscomputer
|
||||
arbeitscomputern
|
||||
computerarbeitscomputer
|
||||
computerarbeitscomputern
|
||||
computerarbeitscomputers
|
||||
arbeitscomputerarbeit
|
||||
computerarbeits-Computer
|
||||
computerarbeits-Computern
|
Loading…
Reference in New Issue