mirror of https://github.com/apache/lucene.git
LUCENE-9704: Hunspell: support capitalization for German ß (#2260)
This commit is contained in:
parent
71705c900b
commit
6635d7a5e7
|
@ -133,6 +133,7 @@ public class Dictionary {
|
|||
boolean hasStemExceptions;
|
||||
|
||||
boolean ignoreCase;
|
||||
boolean checkSharpS;
|
||||
boolean complexPrefixes;
|
||||
// if no affixes have continuation classes, no need to do 2-level affix stripping
|
||||
boolean twoStageAffix;
|
||||
|
@ -353,6 +354,8 @@ public class Dictionary {
|
|||
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("CHECKSHARPS".equals(firstWord)) {
|
||||
checkSharpS = true;
|
||||
} else if ("IGNORE".equals(firstWord)) {
|
||||
ignore = singleArgument(reader, line).toCharArray();
|
||||
Arrays.sort(ignore);
|
||||
|
|
|
@ -61,7 +61,7 @@ public class SpellChecker {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (checkWord(wordChars, wordChars.length, false)) {
|
||||
if (checkWord(wordChars, wordChars.length, null)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -89,23 +89,39 @@ public class SpellChecker {
|
|||
char[] caseVariant = wordChars;
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
||||
if (checkWord(caseVariant, wordChars.length, true)) {
|
||||
if (checkWord(caseVariant, wordChars.length, wordCase)) {
|
||||
return true;
|
||||
}
|
||||
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
|
||||
if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
|
||||
if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
|
||||
return true;
|
||||
}
|
||||
for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
|
||||
if (checkWord(variation, variation.length, null)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
|
||||
char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
|
||||
if (checkWord(lower, wordChars.length, wordCase)) {
|
||||
return true;
|
||||
}
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
|
||||
if (checkWord(variation, variation.length, null)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
|
||||
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
|
||||
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
|
||||
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -114,16 +130,16 @@ public class SpellChecker {
|
|||
return true;
|
||||
}
|
||||
|
||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
|
||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
|
||||
}
|
||||
|
||||
private boolean hasStems(
|
||||
char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
|
||||
return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
|
||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
|
||||
}
|
||||
|
||||
private boolean checkCompounds(
|
||||
char[] chars, int offset, int length, boolean caseVariant, int depth) {
|
||||
char[] chars, int offset, int length, WordCase originalCase, int depth) {
|
||||
if (depth > dictionary.compoundMax - 2) return false;
|
||||
|
||||
int limit = length - dictionary.compoundMin + 1;
|
||||
|
@ -131,13 +147,13 @@ public class SpellChecker {
|
|||
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
||||
int breakOffset = offset + breakPos;
|
||||
if (checkCompoundCase(chars, breakOffset)
|
||||
&& hasStems(chars, offset, breakPos, caseVariant, context)) {
|
||||
&& hasStems(chars, offset, breakPos, originalCase, context)) {
|
||||
int remainingLength = length - breakPos;
|
||||
if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
|
||||
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
|
||||
if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -99,20 +101,32 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
WordCase wordCase = caseOf(word, length);
|
||||
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
|
||||
List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
|
||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||
addCaseVariations(word, length, wordCase, list);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
caseFoldTitle(word, length);
|
||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||
if (aposCase != null) {
|
||||
list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||
for (char[] variation : sharpSVariations(titleBuffer, length)) {
|
||||
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||
list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||
list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
|
||||
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
// temporary buffers for case variants
|
||||
|
@ -163,14 +177,52 @@ final class Stemmer {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Enumerates alternative spellings of the first {@code length} chars of {@code word} in which
 * occurrences of lowercase "ss" are replaced by 'ß'.
 *
 * <p>Occurrences are located left to right without overlap (the search resumes two chars after
 * each match), and each located occurrence is emitted both as "ss" and as "ß". The spelling equal
 * to the input itself is excluded from the result.
 *
 * @param word buffer holding the word; only lowercase 's' pairs are matched
 * @param length number of chars of {@code word} to consider, starting at index 0
 * @return the variations as fresh char arrays; an empty list if the dictionary's {@code
 *     checkSharpS} flag is off or the word contains no "ss"
 */
List<char[]> sharpSVariations(char[] word, int length) {
|
||||
// nothing to enumerate unless the dictionary enabled sharp-s checking
if (!dictionary.checkSharpS) return Collections.emptyList();
|
||||

||||
// anonymous object used only to host two mutually dependent local helper methods
Stream<String> result =
|
||||
new Object() {
|
||||
// index of the first "ss" at or after start, or -1 if there is none
int findSS(int start) {
|
||||
for (int i = start; i < length - 1; i++) {
|
||||
if (word[i] == 's' && word[i + 1] == 's') {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||

||||
// all spellings of word[start..length), or null if that range contains no "ss"
Stream<String> replaceSS(int start, int depth) {
|
||||
if (depth > 5) { // cut off too large enumeration
|
||||
// the rest of the word is kept unchanged, so "ss" past this depth is never replaced
return Stream.of(new String(word, start, length - start));
|
||||
}
|
||||

||||
int ss = findSS(start);
|
||||
if (ss < 0) {
|
||||
return null;
|
||||
} else {
|
||||
String prefix = new String(word, start, ss - start);
|
||||
Stream<String> tails = replaceSS(ss + 2, depth + 1);
|
||||
if (tails == null) {
|
||||
// no further "ss": the only possible tail is the remaining text as is
tails = Stream.of(new String(word, ss + 2, length - ss - 2));
|
||||
}
|
||||
// for every tail, emit both the unchanged "ss" form and the "ß" form
return tails.flatMap(s -> Stream.of(prefix + "ss" + s, prefix + "ß" + s));
|
||||
}
|
||||
}
|
||||
}.replaceSS(0, 0);
|
||||
if (result == null) return Collections.emptyList();
|
||||

||||
String src = new String(word, 0, length);
|
||||
// the all-"ss" combination reproduces the input, so drop it
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
List<CharsRef> doStem(
|
||||
char[] word, int offset, int length, boolean caseVariant, WordContext context) {
|
||||
char[] word, int offset, int length, WordCase originalCase, WordContext context) {
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
||||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
|
||||
if (!acceptCase(caseVariant, wordFlags)) {
|
||||
if (!acceptCase(originalCase, wordFlags, word, offset, length)) {
|
||||
continue;
|
||||
}
|
||||
// we can't add this form, it's a pseudostem requiring an affix
|
||||
|
@ -203,17 +255,35 @@ final class Stemmer {
|
|||
true,
|
||||
false,
|
||||
false,
|
||||
caseVariant));
|
||||
originalCase));
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
|
||||
return caseVariant
|
||||
? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
|
||||
: !Dictionary.hasHiddenFlag(wordFlags);
|
||||
private boolean acceptCase(
|
||||
WordCase originalCase, char[] wordFlags, char[] word, int offset, int length) {
|
||||
boolean keepCase = Dictionary.hasFlag(wordFlags, dictionary.keepcase);
|
||||
if (originalCase != null) {
|
||||
if (keepCase
|
||||
&& dictionary.checkSharpS
|
||||
&& originalCase == WordCase.TITLE
|
||||
&& containsSharpS(word, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
return !keepCase;
|
||||
}
|
||||
return !Dictionary.hasHiddenFlag(wordFlags);
|
||||
}
|
||||
|
||||
/**
 * Tells whether the given slice of {@code word} contains the character 'ß'.
 *
 * @param word buffer to scan
 * @param offset index of the first char to examine
 * @param length number of chars to examine starting at {@code offset}
 * @return true if any examined char equals 'ß', false otherwise (including for an empty slice)
 */
private boolean containsSharpS(char[] word, int offset, int length) {
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (word[i + offset] == 'ß') {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -302,8 +372,8 @@ final class Stemmer {
|
|||
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
|
||||
* @param circumfix true if the previous prefix removal was signed as a circumfix this means inner
|
||||
* most suffix must also contain circumfix flag.
|
||||
* @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag
|
||||
* it cannot succeed.
|
||||
* @param originalCase if non-null, represents original word case to disallow case variations of
|
||||
* word with KEEPCASE flags
|
||||
* @return List of stems, or empty list if no stems are found
|
||||
*/
|
||||
private List<CharsRef> stem(
|
||||
|
@ -319,7 +389,7 @@ final class Stemmer {
|
|||
boolean doSuffix,
|
||||
boolean previousWasPrefix,
|
||||
boolean circumfix,
|
||||
boolean caseVariant)
|
||||
WordCase originalCase)
|
||||
throws IOException {
|
||||
|
||||
// TODO: allow this stuff to be reused by tokenfilter
|
||||
|
@ -371,7 +441,7 @@ final class Stemmer {
|
|||
recursionDepth,
|
||||
true,
|
||||
circumfix,
|
||||
caseVariant));
|
||||
originalCase));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -424,7 +494,7 @@ final class Stemmer {
|
|||
recursionDepth,
|
||||
false,
|
||||
circumfix,
|
||||
caseVariant));
|
||||
originalCase));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -555,7 +625,7 @@ final class Stemmer {
|
|||
int recursionDepth,
|
||||
boolean prefix,
|
||||
boolean circumfix,
|
||||
boolean caseVariant)
|
||||
WordCase originalCase)
|
||||
throws IOException {
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
|
||||
|
@ -589,7 +659,7 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
// we are looking for a case variant, but this word does not allow it
|
||||
if (!acceptCase(caseVariant, wordFlags)) {
|
||||
if (!acceptCase(originalCase, wordFlags, strippedWord, offset, length)) {
|
||||
continue;
|
||||
}
|
||||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||
|
@ -654,7 +724,7 @@ final class Stemmer {
|
|||
true,
|
||||
prefix,
|
||||
circumfix,
|
||||
caseVariant));
|
||||
originalCase));
|
||||
}
|
||||
|
||||
return stems;
|
||||
|
|
|
@ -28,9 +28,9 @@ enum WordCase {
|
|||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
for (int i = 1; i < length; i++) {
|
||||
char ch = word[i];
|
||||
seenUpper = seenUpper || Character.isUpperCase(ch);
|
||||
seenLower = seenLower || Character.isLowerCase(ch);
|
||||
CharCase cc = charCase(word[i]);
|
||||
seenUpper = seenUpper || cc == CharCase.UPPER;
|
||||
seenLower = seenLower || cc == CharCase.LOWER;
|
||||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
|
@ -43,9 +43,9 @@ enum WordCase {
|
|||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
for (int i = 1; i < length; i++) {
|
||||
char ch = word.charAt(i);
|
||||
seenUpper = seenUpper || Character.isUpperCase(ch);
|
||||
seenLower = seenLower || Character.isLowerCase(ch);
|
||||
CharCase cc = charCase(word.charAt(i));
|
||||
seenUpper = seenUpper || cc == CharCase.UPPER;
|
||||
seenLower = seenLower || cc == CharCase.LOWER;
|
||||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
|
@ -58,4 +58,20 @@ enum WordCase {
|
|||
}
|
||||
return seenUpper ? MIXED : LOWER;
|
||||
}
|
||||
|
||||
/**
 * Classifies a single char as upper case, lower case, or case-neutral.
 *
 * @param c the char to classify
 * @return {@code UPPER} if {@link Character#isUpperCase(char)} holds; {@code LOWER} if the char is
 *     lower case and has an upper-case mapping different from itself; {@code NEUTRAL} otherwise
 */
private static CharCase charCase(char c) {
|
||||
if (Character.isUpperCase(c)) {
|
||||
return CharCase.UPPER;
|
||||
}
|
||||
// A lower-case char whose Character.toUpperCase mapping is the char itself is not counted as
// LOWER. NOTE(review): this looks aimed at 'ß', the subject of this change - confirm.
if (Character.isLowerCase(c) && Character.toUpperCase(c) != c) {
|
||||
return CharCase.LOWER;
|
||||
}
|
||||
return CharCase.NEUTRAL;
|
||||
}
|
||||
|
||||
private enum CharCase {
|
||||
UPPER,
|
||||
LOWER,
|
||||
NEUTRAL
|
||||
}
|
||||
}
|
||||
|
|
|
@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("allcaps");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void checkSharpS() throws Exception {
|
||||
doTest("checksharps");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void IJ() throws Exception {
|
||||
doTest("IJ");
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestCheckSharpS extends StemmerTestBase {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
init("checksharps.aff", "checksharps.dic");
|
||||
}
|
||||
|
||||
public void testSharpS() {
|
||||
assertStemsTo("Müßig", "müßig");
|
||||
assertStemsTo("MÜSSIG", "müßig");
|
||||
assertStemsTo("Müssig");
|
||||
assertStemsTo("PROZESSIONSSTRASSE", "Prozessionsstraße");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
# test ß - SS special capitalizing
|
||||
CHECKSHARPS
|
||||
WORDCHARS ß.
|
||||
KEEPCASE k
|
|
@ -0,0 +1,7 @@
|
|||
6
|
||||
müßig/k
|
||||
Ausstoß
|
||||
Abstoß.
|
||||
Außenabmessung
|
||||
Prozessionsstraße
|
||||
Außenmaße
|
|
@ -0,0 +1,13 @@
|
|||
müßig
|
||||
Müßig
|
||||
MÜSSIG
|
||||
Ausstoß
|
||||
Abstoß.
|
||||
Außenabmessung
|
||||
Prozessionsstraße
|
||||
Außenmaße
|
||||
AUSSTOSS
|
||||
ABSTOSS.
|
||||
AUSSENABMESSUNG
|
||||
PROZESSIONSSTRASSE
|
||||
AUSSENMASSE
|
|
@ -0,0 +1,2 @@
|
|||
MÜßIG
|
||||
Müssig
|
Loading…
Reference in New Issue