LUCENE-9704: Hunspell: support capitalization for German ß (#2260)

This commit is contained in:
Peter Gromov 2021-01-29 10:03:37 +01:00 committed by GitHub
parent 71705c900b
commit 6635d7a5e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 211 additions and 41 deletions

View File

@ -133,6 +133,7 @@ public class Dictionary {
boolean hasStemExceptions;
boolean ignoreCase;
boolean checkSharpS;
boolean complexPrefixes;
// if no affixes have continuation classes, no need to do 2-level affix stripping
boolean twoStageAffix;
@ -353,6 +354,8 @@ public class Dictionary {
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("CHECKSHARPS".equals(firstWord)) {
checkSharpS = true;
} else if ("IGNORE".equals(firstWord)) {
ignore = singleArgument(reader, line).toCharArray();
Arrays.sort(ignore);

View File

@ -61,7 +61,7 @@ public class SpellChecker {
return false;
}
if (checkWord(wordChars, wordChars.length, false)) {
if (checkWord(wordChars, wordChars.length, null)) {
return true;
}
@ -89,23 +89,39 @@ public class SpellChecker {
char[] caseVariant = wordChars;
if (wordCase == WordCase.UPPER) {
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
if (checkWord(caseVariant, wordChars.length, true)) {
if (checkWord(caseVariant, wordChars.length, wordCase)) {
return true;
}
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
return true;
}
for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
if (checkWord(variation, variation.length, null)) {
return true;
}
}
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
}
char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
if (checkWord(lower, wordChars.length, wordCase)) {
return true;
}
if (wordCase == WordCase.UPPER) {
for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
if (checkWord(variation, variation.length, null)) {
return true;
}
}
}
return false;
}
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
}
if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
return true;
}
@ -114,16 +130,16 @@ public class SpellChecker {
return true;
}
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
}
private boolean hasStems(
char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
}
private boolean checkCompounds(
char[] chars, int offset, int length, boolean caseVariant, int depth) {
char[] chars, int offset, int length, WordCase originalCase, int depth) {
if (depth > dictionary.compoundMax - 2) return false;
int limit = length - dictionary.compoundMin + 1;
@ -131,13 +147,13 @@ public class SpellChecker {
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
int breakOffset = offset + breakPos;
if (checkCompoundCase(chars, breakOffset)
&& hasStems(chars, offset, breakPos, caseVariant, context)) {
&& hasStems(chars, offset, breakPos, originalCase, context)) {
int remainingLength = length - breakPos;
if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
return true;
}
if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
return true;
}
}

View File

@ -20,6 +20,8 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -99,20 +101,32 @@ final class Stemmer {
}
WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
addCaseVariations(word, length, wordCase, list);
}
return list;
}
private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length);
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null) {
list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
}
list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
for (char[] variation : sharpSVariations(titleBuffer, length)) {
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
}
list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
}
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
if (wordCase == WordCase.UPPER) {
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
}
}
return list;
}
// temporary buffers for case variants
@ -163,14 +177,52 @@ final class Stemmer {
return null;
}
/**
 * Enumerates alternative spellings of {@code word[0..length)} obtained by keeping or replacing
 * occurrences of "ss" with "ß". Only produces results when the dictionary declares CHECKSHARPS.
 *
 * @param word buffer holding the word to vary
 * @param length number of characters of {@code word} to consider, starting at index 0
 * @return the variations that differ from the input, or an empty list if the dictionary does not
 *     enable sharp s checking or the word contains no "ss"
 */
List<char[]> sharpSVariations(char[] word, int length) {
  if (!dictionary.checkSharpS) {
    return Collections.emptyList();
  }

  Stream<String> candidates = expandSharpS(word, length, 0, 0);
  if (candidates == null) {
    // no "ss" anywhere in the word
    return Collections.emptyList();
  }

  String original = new String(word, 0, length);
  return candidates
      .filter(candidate -> !candidate.equals(original))
      .map(String::toCharArray)
      .collect(Collectors.toList());
}

/** Returns the index of the first "ss" at or after {@code from}, or -1 if there is none. */
private static int indexOfDoubleS(char[] word, int length, int from) {
  int pos = from;
  while (pos < length - 1) {
    if (word[pos] == 's' && word[pos + 1] == 's') {
      return pos;
    }
    pos++;
  }
  return -1;
}

/**
 * Builds every "ss"/"ß" combination for the part of the word starting at {@code from}, or returns
 * null when that part contains no "ss".
 */
private static Stream<String> expandSharpS(char[] word, int length, int from, int depth) {
  if (depth > 5) {
    // cut off too large enumeration: keep the rest of the word verbatim
    return Stream.of(new String(word, from, length - from));
  }

  int pairStart = indexOfDoubleS(word, length, from);
  if (pairStart < 0) {
    return null;
  }

  String head = new String(word, from, pairStart - from);
  int afterPair = pairStart + 2;
  Stream<String> tails = expandSharpS(word, length, afterPair, depth + 1);
  if (tails == null) {
    tails = Stream.of(new String(word, afterPair, length - afterPair));
  }
  return tails.flatMap(tail -> Stream.of(head + "ss" + tail, head + "ß" + tail));
}
List<CharsRef> doStem(
char[] word, int offset, int length, boolean caseVariant, WordContext context) {
char[] word, int offset, int length, WordCase originalCase, WordContext context) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
if (!acceptCase(caseVariant, wordFlags)) {
if (!acceptCase(originalCase, wordFlags, word, offset, length)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
@ -203,17 +255,35 @@ final class Stemmer {
true,
false,
false,
caseVariant));
originalCase));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return stems;
}
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
return caseVariant
? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
: !Dictionary.hasHiddenFlag(wordFlags);
/**
 * Decides whether a dictionary entry with the given flags may match the word being checked.
 *
 * @param originalCase null when the word is checked exactly as written; otherwise the case of the
 *     original input, meaning the word being checked is a case variation of it
 * @param wordFlags decoded flags of the dictionary entry
 * @param word buffer holding the word being checked
 * @param offset start of the word inside {@code word}
 * @param length number of characters of the word
 * @return true if the entry is acceptable for this lookup
 */
private boolean acceptCase(
    WordCase originalCase, char[] wordFlags, char[] word, int offset, int length) {
  if (originalCase == null) {
    // the word is checked as written: only the hidden flag can rule the entry out
    return !Dictionary.hasHiddenFlag(wordFlags);
  }

  if (!Dictionary.hasFlag(wordFlags, dictionary.keepcase)) {
    // entries without KEEPCASE accept any case variation
    return true;
  }

  // KEEPCASE entry: the only tolerated variation is a title-cased word containing 'ß'
  // when the dictionary enables CHECKSHARPS
  return dictionary.checkSharpS
      && originalCase == WordCase.TITLE
      && containsSharpS(word, offset, length);
}
/**
 * Tells whether the slice {@code word[offset..offset + length)} contains the character 'ß'.
 *
 * @param word buffer to scan
 * @param offset index of the first character to inspect
 * @param length number of characters to inspect
 * @return true if at least one inspected character is 'ß'
 */
private boolean containsSharpS(char[] word, int offset, int length) {
  int pos = offset;
  for (int remaining = length; remaining > 0; remaining--) {
    if (word[pos++] == 'ß') {
      return true;
    }
  }
  return false;
}
/**
@ -302,8 +372,8 @@ final class Stemmer {
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @param circumfix true if the previous prefix removal was marked as a circumfix; this means the
* innermost suffix must also contain the circumfix flag.
* @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag
* it cannot succeed.
* @param originalCase if non-null, the case of the original input word; used to reject case
* variations of dictionary words that carry the KEEPCASE flag
* @return List of stems, or empty list if no stems are found
*/
private List<CharsRef> stem(
@ -319,7 +389,7 @@ final class Stemmer {
boolean doSuffix,
boolean previousWasPrefix,
boolean circumfix,
boolean caseVariant)
WordCase originalCase)
throws IOException {
// TODO: allow this stuff to be reused by tokenfilter
@ -371,7 +441,7 @@ final class Stemmer {
recursionDepth,
true,
circumfix,
caseVariant));
originalCase));
}
}
}
@ -424,7 +494,7 @@ final class Stemmer {
recursionDepth,
false,
circumfix,
caseVariant));
originalCase));
}
}
}
@ -555,7 +625,7 @@ final class Stemmer {
int recursionDepth,
boolean prefix,
boolean circumfix,
boolean caseVariant)
WordCase originalCase)
throws IOException {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
@ -589,7 +659,7 @@ final class Stemmer {
}
// we are looking for a case variant, but this word does not allow it
if (!acceptCase(caseVariant, wordFlags)) {
if (!acceptCase(originalCase, wordFlags, strippedWord, offset, length)) {
continue;
}
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
@ -654,7 +724,7 @@ final class Stemmer {
true,
prefix,
circumfix,
caseVariant));
originalCase));
}
return stems;

View File

@ -28,9 +28,9 @@ enum WordCase {
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
char ch = word[i];
seenUpper = seenUpper || Character.isUpperCase(ch);
seenLower = seenLower || Character.isLowerCase(ch);
CharCase cc = charCase(word[i]);
seenUpper = seenUpper || cc == CharCase.UPPER;
seenLower = seenLower || cc == CharCase.LOWER;
if (seenUpper && seenLower) break;
}
@ -43,9 +43,9 @@ enum WordCase {
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
char ch = word.charAt(i);
seenUpper = seenUpper || Character.isUpperCase(ch);
seenLower = seenLower || Character.isLowerCase(ch);
CharCase cc = charCase(word.charAt(i));
seenUpper = seenUpper || cc == CharCase.UPPER;
seenLower = seenLower || cc == CharCase.LOWER;
if (seenUpper && seenLower) break;
}
@ -58,4 +58,20 @@ enum WordCase {
}
return seenUpper ? MIXED : LOWER;
}
/**
 * Classifies a single character for word-case detection.
 *
 * <p>A character counts as {@link CharCase#LOWER} only if it is lowercase <em>and</em> {@link
 * Character#toUpperCase(char)} maps it to a different character. Lowercase characters that map to
 * themselves (e.g. 'ß') are {@link CharCase#NEUTRAL}.
 *
 * @param c the character to classify
 * @return the case category of {@code c}
 */
private static CharCase charCase(char c) {
  if (!Character.isUpperCase(c)) {
    boolean hasUpperCounterpart = Character.toUpperCase(c) != c;
    return Character.isLowerCase(c) && hasUpperCounterpart ? CharCase.LOWER : CharCase.NEUTRAL;
  }
  return CharCase.UPPER;
}
/** Case category of a single character, as reported by {@code charCase(char)}. */
private enum CharCase {
// the character satisfies Character.isUpperCase
UPPER,
// the character is lowercase and Character.toUpperCase maps it to a different character
LOWER,
// everything else, including lowercase characters that Character.toUpperCase leaves unchanged
NEUTRAL
}
}

View File

@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
// Spell-checker test for the "checksharps" fixture (CHECKSHARPS / German ß handling).
// NOTE(review): doTest is defined outside this view; presumably it loads the checksharps.*
// resources and verifies the expected good/wrong words — confirm against its implementation.
@Test
public void checkSharpS() throws Exception {
doTest("checksharps");
}
@Test
public void IJ() throws Exception {
doTest("IJ");

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import org.junit.BeforeClass;
/**
 * Stemmer test for dictionaries that declare CHECKSHARPS, i.e. German words where an upper-cased
 * "SS" in the input may correspond to "ß" in the dictionary entry.
 */
public class TestCheckSharpS extends StemmerTestBase {
// Loads the checksharps affix/dictionary fixture once for the whole class.
@BeforeClass
public static void beforeClass() throws Exception {
init("checksharps.aff", "checksharps.dic");
}
// NOTE(review): assertStemsTo is inherited and not visible here; presumably the first argument
// is the input word and the remaining arguments are the expected stems — confirm.
public void testSharpS() {
// title-cased form of an entry containing 'ß'
assertStemsTo("Müßig", "müßig");
// all-caps input spelled with "SS" is expected to resolve to the 'ß' entry
assertStemsTo("MÜSSIG", "müßig");
// no expected stems are listed for the title-cased "ss" spelling
assertStemsTo("Müssig");
// all-caps compound with several "SS" pairs, only one of which is 'ß' in the expected stem
assertStemsTo("PROZESSIONSSTRASSE", "Prozessionsstraße");
}
}

View File

@ -0,0 +1,4 @@
# test ß - SS special capitalizing
CHECKSHARPS
WORDCHARS ß.
KEEPCASE k

View File

@ -0,0 +1,7 @@
6
müßig/k
Ausstoß
Abstoß.
Außenabmessung
Prozessionsstraße
Außenmaße

View File

@ -0,0 +1,13 @@
müßig
Müßig
MÜSSIG
Ausstoß
Abstoß.
Außenabmessung
Prozessionsstraße
Außenmaße
AUSSTOSS
ABSTOSS.
AUSSENABMESSUNG
PROZESSIONSSTRASSE
AUSSENMASSE

View File

@ -0,0 +1,2 @@
MÜßIG
Müssig