mirror of https://github.com/apache/lucene.git
LUCENE-9710: Hunspell: support minor compounding-related flags (#2272)
* LUCENE-9710: Hunspell: support COMPOUNDFLAG * LUCENE-9710: Hunspell: fix CHECKCOMPOUNDCASE support * LUCENE-9710: Hunspell: support CHECKCOMPOUNDDUP * LUCENE-9710: Hunspell: support triple flags (CHECKCOMPOUNDTRIPLE, SIMPLIFIEDTRIPLE) * LUCENE-9710: Hunspell: support COMPOUNDFORBIDFLAG * LUCENE-9710: Hunspell: support FORCEUCASE
This commit is contained in:
parent
40e92315ae
commit
9d45dfe776
|
@ -139,11 +139,13 @@ public class Dictionary {
|
|||
boolean twoStageAffix;
|
||||
|
||||
char circumfix;
|
||||
char keepcase;
|
||||
char keepcase, forceUCase;
|
||||
char needaffix;
|
||||
char forbiddenword;
|
||||
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
|
||||
boolean checkCompoundCase;
|
||||
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag;
|
||||
char compoundPermit, compoundForbid;
|
||||
boolean checkCompoundCase, checkCompoundDup;
|
||||
boolean checkCompoundTriple, simplifiedTriple;
|
||||
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
||||
List<CompoundRule> compoundRules; // nullable
|
||||
|
||||
|
@ -350,6 +352,8 @@ public class Dictionary {
|
|||
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("KEEPCASE".equals(firstWord)) {
|
||||
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("FORCEUCASE".equals(firstWord)) {
|
||||
forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
|
||||
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||
|
@ -387,6 +391,8 @@ public class Dictionary {
|
|||
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
||||
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDFLAG".equals(firstWord)) {
|
||||
compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
|
||||
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
|
||||
|
@ -395,8 +401,16 @@ public class Dictionary {
|
|||
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
|
||||
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) {
|
||||
compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
|
||||
checkCompoundCase = true;
|
||||
} else if ("CHECKCOMPOUNDDUP".equals(firstWord)) {
|
||||
checkCompoundDup = true;
|
||||
} else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) {
|
||||
checkCompoundTriple = true;
|
||||
} else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
|
||||
simplifiedTriple = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -16,9 +16,16 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
/**
|
||||
|
@ -126,7 +133,7 @@ public class SpellChecker {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
|
||||
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -135,12 +142,11 @@ public class SpellChecker {
|
|||
return true;
|
||||
}
|
||||
|
||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
|
||||
}
|
||||
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
|
||||
return checkCompounds(wordChars, 0, length, originalCase, 0);
|
||||
}
|
||||
|
||||
private boolean hasStems(
|
||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkCompounds(
|
||||
|
@ -149,12 +155,23 @@ public class SpellChecker {
|
|||
|
||||
int limit = length - dictionary.compoundMin + 1;
|
||||
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
||||
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||
int breakOffset = offset + breakPos;
|
||||
if (checkCompoundCase(chars, breakOffset)
|
||||
&& hasStems(chars, offset, breakPos, originalCase, context)) {
|
||||
if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
|
||||
List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
|
||||
if (stems.isEmpty()
|
||||
&& dictionary.simplifiedTriple
|
||||
&& chars[breakOffset - 1] == chars[breakOffset]) {
|
||||
stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
|
||||
}
|
||||
if (stems.isEmpty()) continue;
|
||||
|
||||
int remainingLength = length - breakPos;
|
||||
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
|
||||
List<CharsRef> lastStems =
|
||||
stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||
if (!lastStems.isEmpty()
|
||||
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
|
||||
&& !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -167,9 +184,37 @@ public class SpellChecker {
|
|||
return false;
|
||||
}
|
||||
|
||||
private boolean checkCompoundCase(char[] chars, int breakPos) {
|
||||
if (!dictionary.checkCompoundCase) return true;
|
||||
return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
|
||||
private boolean hasForceUCaseProblem(
|
||||
char[] chars, int offset, int length, WordCase originalCase) {
|
||||
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
||||
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
||||
|
||||
IntsRef forms = dictionary.lookupWord(chars, offset, length);
|
||||
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase, scratch);
|
||||
}
|
||||
|
||||
private boolean intersectIgnoreCase(List<CharsRef> stems1, List<CharsRef> stems2) {
|
||||
return stems1.stream().anyMatch(s1 -> stems2.stream().anyMatch(s2 -> equalsIgnoreCase(s1, s2)));
|
||||
}
|
||||
|
||||
private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
|
||||
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
||||
}
|
||||
|
||||
private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
|
||||
if (dictionary.checkCompoundCase) {
|
||||
if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
|
||||
//noinspection RedundantIfStatement
|
||||
if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
|
||||
|| breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean checkCompoundRules(
|
||||
|
|
|
@ -238,9 +238,15 @@ final class Stemmer {
|
|||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
continue;
|
||||
}
|
||||
if (context.isCompound()
|
||||
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
||||
continue;
|
||||
if (context.isCompound()) {
|
||||
if (context != WordContext.COMPOUND_END
|
||||
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
||||
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
stems.add(newStem(word, offset, length, forms, i));
|
||||
}
|
||||
|
@ -554,7 +560,10 @@ final class Stemmer {
|
|||
WordContext context) {
|
||||
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
if (context.isCompound() && dictionary.compoundPermit > 0) {
|
||||
if (context.isCompound()) {
|
||||
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid, scratch)) {
|
||||
return false;
|
||||
}
|
||||
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
|
||||
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
|
||||
return false;
|
||||
|
@ -672,7 +681,10 @@ final class Stemmer {
|
|||
}
|
||||
if (context.isCompound()) {
|
||||
char cFlag = context.requiredFlag(dictionary);
|
||||
if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
|
||||
if (!Dictionary.hasFlag(wordFlags, cFlag)
|
||||
&& !isFlagAppendedByAffix(affix, cFlag)
|
||||
&& !Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
||||
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("allcaps");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "forceucase".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void forceUCase() throws Exception {
|
||||
doTest("forceucase");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void checkSharpS() throws Exception {
|
||||
doTest("checksharps");
|
||||
|
@ -71,6 +76,36 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("needaffix5");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "compoundflag".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void compoundFlag() throws Exception {
|
||||
doTest("compoundflag");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "checkcompoundcase".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void checkCompoundCase() throws Exception {
|
||||
doTest("checkcompoundcase");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "checkcompounddup".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void checkCompoundDup() throws Exception {
|
||||
doTest("checkcompounddup");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "checkcompoundtriple".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void checkCompoundTriple() throws Exception {
|
||||
doTest("checkcompoundtriple");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "simplifiedtriple".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void simplifiedTriple() throws Exception {
|
||||
doTest("simplifiedtriple");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "compoundforbid".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
@Test
|
||||
public void compoundForbid() throws Exception {
|
||||
doTest("compoundforbid");
|
||||
}
|
||||
|
||||
/**
 * Runs the shared {@code doTest} check with the fixture name "break".
 * NOTE(review): doTest is defined outside this view; presumably it loads the test resources with
 * that base name -- confirm there.
 */
public void testBreak() throws Exception {
|
||||
doTest("break");
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# forbid upper case letters at word bounds in compounding
|
||||
CHECKCOMPOUNDCASE
|
||||
COMPOUNDFLAG A
|
|
@ -0,0 +1,5 @@
|
|||
4
|
||||
foo/A
|
||||
Bar/A
|
||||
BAZ/A
|
||||
-/A
|
|
@ -0,0 +1,5 @@
|
|||
Barfoo
|
||||
foo-Bar
|
||||
foo-BAZ
|
||||
BAZ-foo
|
||||
BAZ-Bar
|
|
@ -0,0 +1,3 @@
|
|||
fooBar
|
||||
BAZBar
|
||||
BAZfoo
|
|
@ -0,0 +1,3 @@
|
|||
# Forbid word duplication in compounds
|
||||
CHECKCOMPOUNDDUP
|
||||
COMPOUNDFLAG A
|
|
@ -0,0 +1,3 @@
|
|||
2
|
||||
foo/A
|
||||
bar/A
|
|
@ -0,0 +1,5 @@
|
|||
barfoo
|
||||
foobar
|
||||
foofoobar
|
||||
foobarfoo
|
||||
barfoobarfoo
|
|
@ -0,0 +1,3 @@
|
|||
foofoo
|
||||
foofoofoo
|
||||
foobarbar
|
|
@ -0,0 +1,3 @@
|
|||
# Forbid compound word with triple letters
|
||||
CHECKCOMPOUNDTRIPLE
|
||||
COMPOUNDFLAG A
|
|
@ -0,0 +1,5 @@
|
|||
4
|
||||
foo/A
|
||||
opera/A
|
||||
eel/A
|
||||
bare/A
|
|
@ -0,0 +1,6 @@
|
|||
operafoo
|
||||
operaeel
|
||||
operabare
|
||||
eelbare
|
||||
eelfoo
|
||||
eelopera
|
|
@ -0,0 +1,2 @@
|
|||
fooopera
|
||||
bareeel
|
|
@ -0,0 +1,3 @@
|
|||
COMPOUNDMIN 3
|
||||
COMPOUNDFLAG A
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
4
|
||||
foo/A
|
||||
bar/A
|
||||
xy/A
|
||||
yz/A
|
|
@ -0,0 +1,3 @@
|
|||
foobar
|
||||
barfoo
|
||||
foobarfoo
|
|
@ -0,0 +1,4 @@
|
|||
xyyz
|
||||
fooxy
|
||||
xyfoo
|
||||
fooxybar
|
|
@ -0,0 +1,15 @@
|
|||
# Dictionary words with COMPOUNDFORBIDFLAG are
|
||||
# removed from the beginning and middle of
|
||||
# compound words, overriding the effect of
|
||||
# COMPOUNDPERMITFLAG.
|
||||
#
|
||||
# See compoundaffix3 test for basic usage
|
||||
# of COMPOUNDFORBIDFLAG.
|
||||
|
||||
COMPOUNDFLAG X
|
||||
COMPOUNDPERMITFLAG Y
|
||||
COMPOUNDFORBIDFLAG Z
|
||||
|
||||
SFX S Y 2
|
||||
SFX S 0 bar/YX .
|
||||
SFX S 0 baz/YX .
|
|
@ -0,0 +1,4 @@
|
|||
3
|
||||
foo/S
|
||||
example/X
|
||||
foobaz/Z
|
|
@ -0,0 +1,5 @@
|
|||
foo
|
||||
example
|
||||
foobar
|
||||
foobaz
|
||||
foobarexample
|
|
@ -0,0 +1,3 @@
|
|||
fooexample
|
||||
examplefoo
|
||||
foobazexample
|
|
@ -0,0 +1,4 @@
|
|||
# force capitalized compound
|
||||
TRY F
|
||||
FORCEUCASE A
|
||||
COMPOUNDFLAG C
|
|
@ -0,0 +1,4 @@
|
|||
3
|
||||
foo/C
|
||||
bar/C
|
||||
baz/CA
|
|
@ -0,0 +1,7 @@
|
|||
foo
|
||||
bar
|
||||
baz
|
||||
foobar
|
||||
Foobaz
|
||||
foobazbar
|
||||
Foobarbaz
|
|
@ -0,0 +1,2 @@
|
|||
foobaz
|
||||
foobarbaz
|
|
@ -0,0 +1,8 @@
|
|||
# Forbid compound word with triple letters
|
||||
CHECKCOMPOUNDTRIPLE
|
||||
# Allow simplified forms
|
||||
SIMPLIFIEDTRIPLE
|
||||
|
||||
COMPOUNDMIN 2
|
||||
|
||||
COMPOUNDFLAG A
|
|
@ -0,0 +1,3 @@
|
|||
2
|
||||
glass/A
|
||||
sko/A
|
|
@ -0,0 +1,3 @@
|
|||
glass
|
||||
sko
|
||||
glassko
|
|
@ -0,0 +1 @@
|
|||
glasssko
|
Loading…
Reference in New Issue