LUCENE-9710: Hunspell: support minor compounding-related flags (#2272)

* LUCENE-9710: Hunspell: support COMPOUNDFLAG

* LUCENE-9710: Hunspell: fix CHECKCOMPOUNDCASE support

* LUCENE-9710: Hunspell: support CHECKCOMPOUNDDUP

* LUCENE-9710: Hunspell: support triple flags (CHECKCOMPOUNDTRIPLE, SIMPLIFIEDTRIPLE)

* LUCENE-9710: Hunspell: support COMPOUNDFORBIDFLAG

* LUCENE-9710: Hunspell: support FORCEUCASE
This commit is contained in:
Peter Gromov 2021-02-01 10:20:11 +01:00 committed by GitHub
parent 40e92315ae
commit 9d45dfe776
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 247 additions and 21 deletions

View File

@ -139,11 +139,13 @@ public class Dictionary {
boolean twoStageAffix;
char circumfix;
char keepcase;
char keepcase, forceUCase;
char needaffix;
char forbiddenword;
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
boolean checkCompoundCase;
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag;
char compoundPermit, compoundForbid;
boolean checkCompoundCase, checkCompoundDup;
boolean checkCompoundTriple, simplifiedTriple;
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
List<CompoundRule> compoundRules; // nullable
@ -350,6 +352,8 @@ public class Dictionary {
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("KEEPCASE".equals(firstWord)) {
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("FORCEUCASE".equals(firstWord)) {
forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
@ -387,6 +391,8 @@ public class Dictionary {
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDRULE".equals(firstWord)) {
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDFLAG".equals(firstWord)) {
compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
@ -395,8 +401,16 @@ public class Dictionary {
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) {
compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
checkCompoundCase = true;
} else if ("CHECKCOMPOUNDDUP".equals(firstWord)) {
checkCompoundDup = true;
} else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) {
checkCompoundTriple = true;
} else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
simplifiedTriple = true;
}
}

View File

@ -16,9 +16,16 @@
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
/**
@ -126,7 +133,7 @@ public class SpellChecker {
return false;
}
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
return true;
}
@ -135,12 +142,11 @@ public class SpellChecker {
return true;
}
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
}
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
return checkCompounds(wordChars, 0, length, originalCase, 0);
}
private boolean hasStems(
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
return false;
}
private boolean checkCompounds(
@ -149,12 +155,23 @@ public class SpellChecker {
int limit = length - dictionary.compoundMin + 1;
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakOffset = offset + breakPos;
if (checkCompoundCase(chars, breakOffset)
&& hasStems(chars, offset, breakPos, originalCase, context)) {
if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
if (stems.isEmpty()
&& dictionary.simplifiedTriple
&& chars[breakOffset - 1] == chars[breakOffset]) {
stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
}
if (stems.isEmpty()) continue;
int remainingLength = length - breakPos;
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
List<CharsRef> lastStems =
stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (!lastStems.isEmpty()
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
&& !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
return true;
}
@ -167,9 +184,37 @@ public class SpellChecker {
return false;
}
private boolean checkCompoundCase(char[] chars, int breakPos) {
if (!dictionary.checkCompoundCase) return true;
return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
/**
 * Tells whether the word part at {@code chars[offset, offset + length)} has a dictionary entry
 * carrying the FORCEUCASE flag although the checked word is neither title-cased nor upper-cased.
 *
 * @return {@code true} only if FORCEUCASE is configured, {@code originalCase} is not TITLE or
 *     UPPER, and the looked-up entry has the flag
 */
private boolean hasForceUCaseProblem(
    char[] chars, int offset, int length, WordCase originalCase) {
  boolean flagConfigured = dictionary.forceUCase != FLAG_UNSET;
  boolean capitalized = originalCase == WordCase.TITLE || originalCase == WordCase.UPPER;
  if (!flagConfigured || capitalized) {
    return false;
  }

  // Only an exact dictionary entry for this part is consulted.
  IntsRef entries = dictionary.lookupWord(chars, offset, length);
  if (entries == null) {
    return false;
  }
  return dictionary.hasFlag(entries, dictionary.forceUCase, scratch);
}
/**
 * Returns {@code true} if at least one stem of {@code stems1} equals, ignoring case, at least one
 * stem of {@code stems2}.
 */
private boolean intersectIgnoreCase(List<CharsRef> stems1, List<CharsRef> stems2) {
  for (CharsRef first : stems1) {
    for (CharsRef second : stems2) {
      if (equalsIgnoreCase(first, second)) {
        return true;
      }
    }
  }
  return false;
}
/** Compares the string forms of the two references, ignoring case differences. */
private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
  String left = cr1.toString();
  String right = cr2.toString();
  return left.equalsIgnoreCase(right);
}
/**
 * Checks whether the boundary-related options allow a compound part to end right before
 * {@code breakPos}.
 *
 * @param chars buffer holding the word being checked
 * @param offset absolute index in {@code chars} where the current word part starts
 * @param length number of characters of the current word part, counted from {@code offset}
 * @param breakPos absolute index in {@code chars} of the first character after the break
 * @return {@code false} if CHECKCOMPOUNDCASE or CHECKCOMPOUNDTRIPLE forbids breaking here,
 *     {@code true} otherwise
 */
private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
  if (dictionary.checkCompoundCase) {
    // No upper case letter may touch the boundary from either side.
    if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) {
      return false;
    }
  }
  if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
    // breakPos is an absolute index, so both bounds must be absolute as well. The look-ahead
    // bound used to be the relative "length - 1", which skipped the check when offset > 0.
    int end = offset + length;
    boolean tripleBefore = breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1];
    boolean tripleAfter = breakPos < end - 1 && chars[breakPos] == chars[breakPos + 1];
    if (tripleBefore || tripleAfter) {
      return false;
    }
  }
  return true;
}
private boolean checkCompoundRules(

View File

@ -238,9 +238,15 @@ final class Stemmer {
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
continue;
}
if (context.isCompound()
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
continue;
if (context.isCompound()) {
if (context != WordContext.COMPOUND_END
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
return new ArrayList<>();
}
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
continue;
}
}
stems.add(newStem(word, offset, length, forms, i));
}
@ -554,7 +560,10 @@ final class Stemmer {
WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
if (context.isCompound() && dictionary.compoundPermit > 0) {
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid, scratch)) {
return false;
}
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
return false;
@ -672,7 +681,10 @@ final class Stemmer {
}
if (context.isCompound()) {
char cFlag = context.requiredFlag(dictionary);
if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
if (!Dictionary.hasFlag(wordFlags, cFlag)
&& !isFlagAppendedByAffix(affix, cFlag)
&& !Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
continue;
}
}

View File

@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
// Delegates to doTest with the base name "forceucase".
// NOTE(review): presumably this selects the FORCEUCASE fixture files; confirm in doTest.
@Test
public void forceUCase() throws Exception {
doTest("forceucase");
}
@Test
public void checkSharpS() throws Exception {
doTest("checksharps");
@ -71,6 +76,36 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("needaffix5");
}
// Delegates to doTest with the base name "compoundflag".
// NOTE(review): presumably this selects the COMPOUNDFLAG fixture files; confirm in doTest.
@Test
public void compoundFlag() throws Exception {
doTest("compoundflag");
}
// Delegates to doTest with the base name "checkcompoundcase".
// NOTE(review): presumably this selects the CHECKCOMPOUNDCASE fixture files; confirm in doTest.
@Test
public void checkCompoundCase() throws Exception {
doTest("checkcompoundcase");
}
// Delegates to doTest with the base name "checkcompounddup".
// NOTE(review): presumably this selects the CHECKCOMPOUNDDUP fixture files; confirm in doTest.
@Test
public void checkCompoundDup() throws Exception {
doTest("checkcompounddup");
}
// Delegates to doTest with the base name "checkcompoundtriple".
// NOTE(review): presumably this selects the CHECKCOMPOUNDTRIPLE fixture files; confirm in doTest.
@Test
public void checkCompoundTriple() throws Exception {
doTest("checkcompoundtriple");
}
// Delegates to doTest with the base name "simplifiedtriple".
// NOTE(review): presumably this selects the SIMPLIFIEDTRIPLE fixture files; confirm in doTest.
@Test
public void simplifiedTriple() throws Exception {
doTest("simplifiedtriple");
}
// Delegates to doTest with the base name "compoundforbid".
// NOTE(review): presumably this selects the COMPOUNDFORBIDFLAG fixture files; confirm in doTest.
@Test
public void compoundForbid() throws Exception {
doTest("compoundforbid");
}
// Delegates to doTest with the base name "break".
// NOTE(review): presumably this selects the BREAK fixture files; confirm in doTest.
public void testBreak() throws Exception {
doTest("break");
}

View File

@ -0,0 +1,3 @@
# forbid upper case letters at word bounds in compounding
CHECKCOMPOUNDCASE
COMPOUNDFLAG A

View File

@ -0,0 +1,5 @@
4
foo/A
Bar/A
BAZ/A
-/A

View File

@ -0,0 +1,5 @@
Barfoo
foo-Bar
foo-BAZ
BAZ-foo
BAZ-Bar

View File

@ -0,0 +1,3 @@
fooBar
BAZBar
BAZfoo

View File

@ -0,0 +1,3 @@
# Forbid word duplication in compounds (e.g. foofoo)
CHECKCOMPOUNDDUP
COMPOUNDFLAG A

View File

@ -0,0 +1,5 @@
barfoo
foobar
foofoobar
foobarfoo
barfoobarfoo

View File

@ -0,0 +1,3 @@
foofoo
foofoofoo
foobarbar

View File

@ -0,0 +1,3 @@
# Forbid compound word with triple letters
CHECKCOMPOUNDTRIPLE
COMPOUNDFLAG A

View File

@ -0,0 +1,5 @@
4
foo/A
opera/A
eel/A
bare/A

View File

@ -0,0 +1,6 @@
operafoo
operaeel
operabare
eelbare
eelfoo
eelopera

View File

@ -0,0 +1,3 @@
COMPOUNDMIN 3
COMPOUNDFLAG A

View File

@ -0,0 +1,5 @@
4
foo/A
bar/A
xy/A
yz/A

View File

@ -0,0 +1,3 @@
foobar
barfoo
foobarfoo

View File

@ -0,0 +1,4 @@
xyyz
fooxy
xyfoo
fooxybar

View File

@ -0,0 +1,15 @@
# Dictionary words with COMPOUNDFORBIDFLAG are
# removed from the beginning and middle of
# compound words, overriding the effect of
# COMPOUNDPERMITFLAG.
#
# See compoundaffix3 test for basic usage
# of COMPOUNDFORBIDFLAG.
COMPOUNDFLAG X
COMPOUNDPERMITFLAG Y
COMPOUNDFORBIDFLAG Z
SFX S Y 2
SFX S 0 bar/YX .
SFX S 0 baz/YX .

View File

@ -0,0 +1,4 @@
3
foo/S
example/X
foobaz/Z

View File

@ -0,0 +1,5 @@
foo
example
foobar
foobaz
foobarexample

View File

@ -0,0 +1,3 @@
fooexample
examplefoo
foobazexample

View File

@ -0,0 +1,4 @@
# force capitalized compound
TRY F
FORCEUCASE A
COMPOUNDFLAG C

View File

@ -0,0 +1,4 @@
3
foo/C
bar/C
baz/CA

View File

@ -0,0 +1,7 @@
foo
bar
baz
foobar
Foobaz
foobazbar
Foobarbaz

View File

@ -0,0 +1,2 @@
foobaz
foobarbaz

View File

@ -0,0 +1,8 @@
# Forbid compound word with triple letters
CHECKCOMPOUNDTRIPLE
# Allow simplified forms
SIMPLIFIEDTRIPLE
COMPOUNDMIN 2
COMPOUNDFLAG A

View File

@ -0,0 +1,3 @@
glass
sko
glassko