mirror of https://github.com/apache/lucene.git
LUCENE-9710: Hunspell: support minor compounding-related flags (#2272)
* LUCENE-9710: Hunspell: support COMPOUNDFLAG
* LUCENE-9710: Hunspell: fix CHECKCOMPOUNDCASE support
* LUCENE-9710: Hunspell: support CHECKCOMPOUNDDUP
* LUCENE-9710: Hunspell: support triple flags (CHECKCOMPOUNDTRIPLE, SIMPLIFIEDTRIPLE)
* LUCENE-9710: Hunspell: support COMPOUNDFORBIDFLAG
* LUCENE-9710: Hunspell: support FORCEUCASE
This commit is contained in:
parent
40e92315ae
commit
9d45dfe776
|
@ -139,11 +139,13 @@ public class Dictionary {
|
||||||
boolean twoStageAffix;
|
boolean twoStageAffix;
|
||||||
|
|
||||||
char circumfix;
|
char circumfix;
|
||||||
char keepcase;
|
char keepcase, forceUCase;
|
||||||
char needaffix;
|
char needaffix;
|
||||||
char forbiddenword;
|
char forbiddenword;
|
||||||
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
|
char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag;
|
||||||
boolean checkCompoundCase;
|
char compoundPermit, compoundForbid;
|
||||||
|
boolean checkCompoundCase, checkCompoundDup;
|
||||||
|
boolean checkCompoundTriple, simplifiedTriple;
|
||||||
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
||||||
List<CompoundRule> compoundRules; // nullable
|
List<CompoundRule> compoundRules; // nullable
|
||||||
|
|
||||||
|
@ -350,6 +352,8 @@ public class Dictionary {
|
||||||
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("KEEPCASE".equals(firstWord)) {
|
} else if ("KEEPCASE".equals(firstWord)) {
|
||||||
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
|
} else if ("FORCEUCASE".equals(firstWord)) {
|
||||||
|
forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
|
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
|
||||||
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||||
|
@ -387,6 +391,8 @@ public class Dictionary {
|
||||||
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||||
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
||||||
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
||||||
|
} else if ("COMPOUNDFLAG".equals(firstWord)) {
|
||||||
|
compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
|
} else if ("COMPOUNDBEGIN".equals(firstWord)) {
|
||||||
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
|
} else if ("COMPOUNDMIDDLE".equals(firstWord)) {
|
||||||
|
@ -395,8 +401,16 @@ public class Dictionary {
|
||||||
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
|
} else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
|
||||||
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
|
} else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) {
|
||||||
|
compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
|
} else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
|
||||||
checkCompoundCase = true;
|
checkCompoundCase = true;
|
||||||
|
} else if ("CHECKCOMPOUNDDUP".equals(firstWord)) {
|
||||||
|
checkCompoundDup = true;
|
||||||
|
} else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) {
|
||||||
|
checkCompoundTriple = true;
|
||||||
|
} else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
|
||||||
|
simplifiedTriple = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,9 +16,16 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -126,7 +133,7 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
|
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,12 +142,11 @@ public class SpellChecker {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
|
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
|
||||||
}
|
return checkCompounds(wordChars, 0, length, originalCase, 0);
|
||||||
|
}
|
||||||
|
|
||||||
private boolean hasStems(
|
return false;
|
||||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
|
||||||
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompounds(
|
private boolean checkCompounds(
|
||||||
|
@ -149,12 +155,23 @@ public class SpellChecker {
|
||||||
|
|
||||||
int limit = length - dictionary.compoundMin + 1;
|
int limit = length - dictionary.compoundMin + 1;
|
||||||
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||||
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||||
int breakOffset = offset + breakPos;
|
int breakOffset = offset + breakPos;
|
||||||
if (checkCompoundCase(chars, breakOffset)
|
if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
|
||||||
&& hasStems(chars, offset, breakPos, originalCase, context)) {
|
List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
|
||||||
|
if (stems.isEmpty()
|
||||||
|
&& dictionary.simplifiedTriple
|
||||||
|
&& chars[breakOffset - 1] == chars[breakOffset]) {
|
||||||
|
stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
|
||||||
|
}
|
||||||
|
if (stems.isEmpty()) continue;
|
||||||
|
|
||||||
int remainingLength = length - breakPos;
|
int remainingLength = length - breakPos;
|
||||||
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
|
List<CharsRef> lastStems =
|
||||||
|
stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||||
|
if (!lastStems.isEmpty()
|
||||||
|
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
|
||||||
|
&& !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -167,9 +184,37 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompoundCase(char[] chars, int breakPos) {
|
private boolean hasForceUCaseProblem(
|
||||||
if (!dictionary.checkCompoundCase) return true;
|
char[] chars, int offset, int length, WordCase originalCase) {
|
||||||
return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
|
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
||||||
|
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
||||||
|
|
||||||
|
IntsRef forms = dictionary.lookupWord(chars, offset, length);
|
||||||
|
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase, scratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean intersectIgnoreCase(List<CharsRef> stems1, List<CharsRef> stems2) {
|
||||||
|
return stems1.stream().anyMatch(s1 -> stems2.stream().anyMatch(s2 -> equalsIgnoreCase(s1, s2)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
|
||||||
|
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
|
||||||
|
if (dictionary.checkCompoundCase) {
|
||||||
|
if (Character.isUpperCase(chars[breakPos - 1]) || Character.isUpperCase(chars[breakPos])) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
|
||||||
|
//noinspection RedundantIfStatement
|
||||||
|
if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
|
||||||
|
|| breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompoundRules(
|
private boolean checkCompoundRules(
|
||||||
|
|
|
@ -238,9 +238,15 @@ final class Stemmer {
|
||||||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (context.isCompound()
|
if (context.isCompound()) {
|
||||||
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
if (context != WordContext.COMPOUND_END
|
||||||
continue;
|
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
||||||
|
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
stems.add(newStem(word, offset, length, forms, i));
|
stems.add(newStem(word, offset, length, forms, i));
|
||||||
}
|
}
|
||||||
|
@ -554,7 +560,10 @@ final class Stemmer {
|
||||||
WordContext context) {
|
WordContext context) {
|
||||||
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||||
|
|
||||||
if (context.isCompound() && dictionary.compoundPermit > 0) {
|
if (context.isCompound()) {
|
||||||
|
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid, scratch)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
|
WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
|
||||||
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
|
if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -672,7 +681,10 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
if (context.isCompound()) {
|
if (context.isCompound()) {
|
||||||
char cFlag = context.requiredFlag(dictionary);
|
char cFlag = context.requiredFlag(dictionary);
|
||||||
if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
|
if (!Dictionary.hasFlag(wordFlags, cFlag)
|
||||||
|
&& !isFlagAppendedByAffix(affix, cFlag)
|
||||||
|
&& !Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
||||||
|
&& !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("allcaps");
|
doTest("allcaps");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void forceUCase() throws Exception {
|
||||||
|
doTest("forceucase");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void checkSharpS() throws Exception {
|
public void checkSharpS() throws Exception {
|
||||||
doTest("checksharps");
|
doTest("checksharps");
|
||||||
|
@ -71,6 +76,36 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("needaffix5");
|
doTest("needaffix5");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void compoundFlag() throws Exception {
|
||||||
|
doTest("compoundflag");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void checkCompoundCase() throws Exception {
|
||||||
|
doTest("checkcompoundcase");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void checkCompoundDup() throws Exception {
|
||||||
|
doTest("checkcompounddup");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void checkCompoundTriple() throws Exception {
|
||||||
|
doTest("checkcompoundtriple");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void simplifiedTriple() throws Exception {
|
||||||
|
doTest("simplifiedtriple");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void compoundForbid() throws Exception {
|
||||||
|
doTest("compoundforbid");
|
||||||
|
}
|
||||||
|
|
||||||
public void testBreak() throws Exception {
|
public void testBreak() throws Exception {
|
||||||
doTest("break");
|
doTest("break");
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
# forbid upper case letters at word bounds in compounding
|
||||||
|
CHECKCOMPOUNDCASE
|
||||||
|
COMPOUNDFLAG A
|
|
@ -0,0 +1,5 @@
|
||||||
|
4
|
||||||
|
foo/A
|
||||||
|
Bar/A
|
||||||
|
BAZ/A
|
||||||
|
-/A
|
|
@ -0,0 +1,5 @@
|
||||||
|
Barfoo
|
||||||
|
foo-Bar
|
||||||
|
foo-BAZ
|
||||||
|
BAZ-foo
|
||||||
|
BAZ-Bar
|
|
@ -0,0 +1,3 @@
|
||||||
|
fooBar
|
||||||
|
BAZBar
|
||||||
|
BAZfoo
|
|
@ -0,0 +1,3 @@
|
||||||
|
# Forbid word duplication in compounds (e.g. foofoo)
|
||||||
|
CHECKCOMPOUNDDUP
|
||||||
|
COMPOUNDFLAG A
|
|
@ -0,0 +1,3 @@
|
||||||
|
2
|
||||||
|
foo/A
|
||||||
|
bar/A
|
|
@ -0,0 +1,5 @@
|
||||||
|
barfoo
|
||||||
|
foobar
|
||||||
|
foofoobar
|
||||||
|
foobarfoo
|
||||||
|
barfoobarfoo
|
|
@ -0,0 +1,3 @@
|
||||||
|
foofoo
|
||||||
|
foofoofoo
|
||||||
|
foobarbar
|
|
@ -0,0 +1,3 @@
|
||||||
|
# Forbid compound word with triple letters
|
||||||
|
CHECKCOMPOUNDTRIPLE
|
||||||
|
COMPOUNDFLAG A
|
|
@ -0,0 +1,5 @@
|
||||||
|
4
|
||||||
|
foo/A
|
||||||
|
opera/A
|
||||||
|
eel/A
|
||||||
|
bare/A
|
|
@ -0,0 +1,6 @@
|
||||||
|
operafoo
|
||||||
|
operaeel
|
||||||
|
operabare
|
||||||
|
eelbare
|
||||||
|
eelfoo
|
||||||
|
eelopera
|
|
@ -0,0 +1,2 @@
|
||||||
|
fooopera
|
||||||
|
bareeel
|
|
@ -0,0 +1,3 @@
|
||||||
|
COMPOUNDMIN 3
|
||||||
|
COMPOUNDFLAG A
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
4
|
||||||
|
foo/A
|
||||||
|
bar/A
|
||||||
|
xy/A
|
||||||
|
yz/A
|
|
@ -0,0 +1,3 @@
|
||||||
|
foobar
|
||||||
|
barfoo
|
||||||
|
foobarfoo
|
|
@ -0,0 +1,4 @@
|
||||||
|
xyyz
|
||||||
|
fooxy
|
||||||
|
xyfoo
|
||||||
|
fooxybar
|
|
@ -0,0 +1,15 @@
|
||||||
|
# Dictionary words with COMPOUNDFORBIDFLAG are
|
||||||
|
# removed from the beginning and middle of
|
||||||
|
# compound words, overriding the effect of
|
||||||
|
# COMPOUNDPERMITFLAG.
|
||||||
|
#
|
||||||
|
# See compoundaffix3 test for basic usage
|
||||||
|
# of COMPOUNDFORBIDFLAG.
|
||||||
|
|
||||||
|
COMPOUNDFLAG X
|
||||||
|
COMPOUNDPERMITFLAG Y
|
||||||
|
COMPOUNDFORBIDFLAG Z
|
||||||
|
|
||||||
|
SFX S Y 2
|
||||||
|
SFX S 0 bar/YX .
|
||||||
|
SFX S 0 baz/YX .
|
|
@ -0,0 +1,4 @@
|
||||||
|
3
|
||||||
|
foo/S
|
||||||
|
example/X
|
||||||
|
foobaz/Z
|
|
@ -0,0 +1,5 @@
|
||||||
|
foo
|
||||||
|
example
|
||||||
|
foobar
|
||||||
|
foobaz
|
||||||
|
foobarexample
|
|
@ -0,0 +1,3 @@
|
||||||
|
fooexample
|
||||||
|
examplefoo
|
||||||
|
foobazexample
|
|
@ -0,0 +1,4 @@
|
||||||
|
# force capitalized compound
|
||||||
|
TRY F
|
||||||
|
FORCEUCASE A
|
||||||
|
COMPOUNDFLAG C
|
|
@ -0,0 +1,4 @@
|
||||||
|
3
|
||||||
|
foo/C
|
||||||
|
bar/C
|
||||||
|
baz/CA
|
|
@ -0,0 +1,7 @@
|
||||||
|
foo
|
||||||
|
bar
|
||||||
|
baz
|
||||||
|
foobar
|
||||||
|
Foobaz
|
||||||
|
foobazbar
|
||||||
|
Foobarbaz
|
|
@ -0,0 +1,2 @@
|
||||||
|
foobaz
|
||||||
|
foobarbaz
|
|
@ -0,0 +1,8 @@
|
||||||
|
# Forbid compound word with triple letters
|
||||||
|
CHECKCOMPOUNDTRIPLE
|
||||||
|
# Allow simplified forms
|
||||||
|
SIMPLIFIEDTRIPLE
|
||||||
|
|
||||||
|
COMPOUNDMIN 2
|
||||||
|
|
||||||
|
COMPOUNDFLAG A
|
|
@ -0,0 +1,3 @@
|
||||||
|
2
|
||||||
|
glass/A
|
||||||
|
sko/A
|
|
@ -0,0 +1,3 @@
|
||||||
|
glass
|
||||||
|
sko
|
||||||
|
glassko
|
|
@ -0,0 +1 @@
|
||||||
|
glasssko
|
Loading…
Reference in New Issue