mirror of https://github.com/apache/lucene.git
LUCENE-9704: Hunspell: support capitalization for German ß (#2260)
This commit is contained in:
parent
71705c900b
commit
6635d7a5e7
|
@ -133,6 +133,7 @@ public class Dictionary {
|
||||||
boolean hasStemExceptions;
|
boolean hasStemExceptions;
|
||||||
|
|
||||||
boolean ignoreCase;
|
boolean ignoreCase;
|
||||||
|
boolean checkSharpS;
|
||||||
boolean complexPrefixes;
|
boolean complexPrefixes;
|
||||||
// if no affixes have continuation classes, no need to do 2-level affix stripping
|
// if no affixes have continuation classes, no need to do 2-level affix stripping
|
||||||
boolean twoStageAffix;
|
boolean twoStageAffix;
|
||||||
|
@ -353,6 +354,8 @@ public class Dictionary {
|
||||||
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||||
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
|
} else if ("CHECKSHARPS".equals(firstWord)) {
|
||||||
|
checkSharpS = true;
|
||||||
} else if ("IGNORE".equals(firstWord)) {
|
} else if ("IGNORE".equals(firstWord)) {
|
||||||
ignore = singleArgument(reader, line).toCharArray();
|
ignore = singleArgument(reader, line).toCharArray();
|
||||||
Arrays.sort(ignore);
|
Arrays.sort(ignore);
|
||||||
|
|
|
@ -61,7 +61,7 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkWord(wordChars, wordChars.length, false)) {
|
if (checkWord(wordChars, wordChars.length, null)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,23 +89,39 @@ public class SpellChecker {
|
||||||
char[] caseVariant = wordChars;
|
char[] caseVariant = wordChars;
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
||||||
if (checkWord(caseVariant, wordChars.length, true)) {
|
if (checkWord(caseVariant, wordChars.length, wordCase)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
|
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
|
||||||
if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
|
if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
|
||||||
|
if (checkWord(variation, variation.length, null)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
|
char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
|
||||||
|
if (checkWord(lower, wordChars.length, wordCase)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (wordCase == WordCase.UPPER) {
|
||||||
|
for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
|
||||||
|
if (checkWord(variation, variation.length, null)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
|
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
|
||||||
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
|
if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,16 +130,16 @@ public class SpellChecker {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
|
return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasStems(
|
private boolean hasStems(
|
||||||
char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
|
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||||
return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
|
return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompounds(
|
private boolean checkCompounds(
|
||||||
char[] chars, int offset, int length, boolean caseVariant, int depth) {
|
char[] chars, int offset, int length, WordCase originalCase, int depth) {
|
||||||
if (depth > dictionary.compoundMax - 2) return false;
|
if (depth > dictionary.compoundMax - 2) return false;
|
||||||
|
|
||||||
int limit = length - dictionary.compoundMin + 1;
|
int limit = length - dictionary.compoundMin + 1;
|
||||||
|
@ -131,13 +147,13 @@ public class SpellChecker {
|
||||||
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
|
||||||
int breakOffset = offset + breakPos;
|
int breakOffset = offset + breakPos;
|
||||||
if (checkCompoundCase(chars, breakOffset)
|
if (checkCompoundCase(chars, breakOffset)
|
||||||
&& hasStems(chars, offset, breakPos, caseVariant, context)) {
|
&& hasStems(chars, offset, breakPos, originalCase, context)) {
|
||||||
int remainingLength = length - breakPos;
|
int remainingLength = length - breakPos;
|
||||||
if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
|
if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
|
if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,8 @@ import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -99,20 +101,32 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
WordCase wordCase = caseOf(word, length);
|
WordCase wordCase = caseOf(word, length);
|
||||||
List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
|
List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
|
||||||
|
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||||
|
addCaseVariations(word, length, wordCase, list);
|
||||||
|
}
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
caseFoldTitle(word, length);
|
caseFoldTitle(word, length);
|
||||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||||
if (aposCase != null) {
|
if (aposCase != null) {
|
||||||
list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
|
list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||||
|
}
|
||||||
|
list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||||
|
for (char[] variation : sharpSVariations(titleBuffer, length)) {
|
||||||
|
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
||||||
}
|
}
|
||||||
list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
|
||||||
}
|
}
|
||||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
||||||
list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
|
if (wordCase == WordCase.UPPER) {
|
||||||
|
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
|
||||||
|
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return list;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// temporary buffers for case variants
|
// temporary buffers for case variants
|
||||||
|
@ -163,14 +177,52 @@ final class Stemmer {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<char[]> sharpSVariations(char[] word, int length) {
|
||||||
|
if (!dictionary.checkSharpS) return Collections.emptyList();
|
||||||
|
|
||||||
|
Stream<String> result =
|
||||||
|
new Object() {
|
||||||
|
int findSS(int start) {
|
||||||
|
for (int i = start; i < length - 1; i++) {
|
||||||
|
if (word[i] == 's' && word[i + 1] == 's') {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Stream<String> replaceSS(int start, int depth) {
|
||||||
|
if (depth > 5) { // cut off too large enumeration
|
||||||
|
return Stream.of(new String(word, start, length - start));
|
||||||
|
}
|
||||||
|
|
||||||
|
int ss = findSS(start);
|
||||||
|
if (ss < 0) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
String prefix = new String(word, start, ss - start);
|
||||||
|
Stream<String> tails = replaceSS(ss + 2, depth + 1);
|
||||||
|
if (tails == null) {
|
||||||
|
tails = Stream.of(new String(word, ss + 2, length - ss - 2));
|
||||||
|
}
|
||||||
|
return tails.flatMap(s -> Stream.of(prefix + "ss" + s, prefix + "ß" + s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}.replaceSS(0, 0);
|
||||||
|
if (result == null) return Collections.emptyList();
|
||||||
|
|
||||||
|
String src = new String(word, 0, length);
|
||||||
|
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
List<CharsRef> doStem(
|
List<CharsRef> doStem(
|
||||||
char[] word, int offset, int length, boolean caseVariant, WordContext context) {
|
char[] word, int offset, int length, WordCase originalCase, WordContext context) {
|
||||||
List<CharsRef> stems = new ArrayList<>();
|
List<CharsRef> stems = new ArrayList<>();
|
||||||
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
||||||
if (forms != null) {
|
if (forms != null) {
|
||||||
for (int i = 0; i < forms.length; i += formStep) {
|
for (int i = 0; i < forms.length; i += formStep) {
|
||||||
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
|
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
|
||||||
if (!acceptCase(caseVariant, wordFlags)) {
|
if (!acceptCase(originalCase, wordFlags, word, offset, length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// we can't add this form, it's a pseudostem requiring an affix
|
// we can't add this form, it's a pseudostem requiring an affix
|
||||||
|
@ -203,17 +255,35 @@ final class Stemmer {
|
||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
caseVariant));
|
originalCase));
|
||||||
} catch (IOException bogus) {
|
} catch (IOException bogus) {
|
||||||
throw new RuntimeException(bogus);
|
throw new RuntimeException(bogus);
|
||||||
}
|
}
|
||||||
return stems;
|
return stems;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
|
private boolean acceptCase(
|
||||||
return caseVariant
|
WordCase originalCase, char[] wordFlags, char[] word, int offset, int length) {
|
||||||
? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
|
boolean keepCase = Dictionary.hasFlag(wordFlags, dictionary.keepcase);
|
||||||
: !Dictionary.hasHiddenFlag(wordFlags);
|
if (originalCase != null) {
|
||||||
|
if (keepCase
|
||||||
|
&& dictionary.checkSharpS
|
||||||
|
&& originalCase == WordCase.TITLE
|
||||||
|
&& containsSharpS(word, offset, length)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return !keepCase;
|
||||||
|
}
|
||||||
|
return !Dictionary.hasHiddenFlag(wordFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean containsSharpS(char[] word, int offset, int length) {
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
if (word[i + offset] == 'ß') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -302,8 +372,8 @@ final class Stemmer {
|
||||||
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
|
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
|
||||||
* @param circumfix true if the previous prefix removal was signed as a circumfix this means inner
|
* @param circumfix true if the previous prefix removal was signed as a circumfix this means inner
|
||||||
* most suffix must also contain circumfix flag.
|
* most suffix must also contain circumfix flag.
|
||||||
* @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag
|
* @param originalCase if non-null, represents original word case to disallow case variations of
|
||||||
* it cannot succeed.
|
* word with KEEPCASE flags
|
||||||
* @return List of stems, or empty list if no stems are found
|
* @return List of stems, or empty list if no stems are found
|
||||||
*/
|
*/
|
||||||
private List<CharsRef> stem(
|
private List<CharsRef> stem(
|
||||||
|
@ -319,7 +389,7 @@ final class Stemmer {
|
||||||
boolean doSuffix,
|
boolean doSuffix,
|
||||||
boolean previousWasPrefix,
|
boolean previousWasPrefix,
|
||||||
boolean circumfix,
|
boolean circumfix,
|
||||||
boolean caseVariant)
|
WordCase originalCase)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
// TODO: allow this stuff to be reused by tokenfilter
|
// TODO: allow this stuff to be reused by tokenfilter
|
||||||
|
@ -371,7 +441,7 @@ final class Stemmer {
|
||||||
recursionDepth,
|
recursionDepth,
|
||||||
true,
|
true,
|
||||||
circumfix,
|
circumfix,
|
||||||
caseVariant));
|
originalCase));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -424,7 +494,7 @@ final class Stemmer {
|
||||||
recursionDepth,
|
recursionDepth,
|
||||||
false,
|
false,
|
||||||
circumfix,
|
circumfix,
|
||||||
caseVariant));
|
originalCase));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -555,7 +625,7 @@ final class Stemmer {
|
||||||
int recursionDepth,
|
int recursionDepth,
|
||||||
boolean prefix,
|
boolean prefix,
|
||||||
boolean circumfix,
|
boolean circumfix,
|
||||||
boolean caseVariant)
|
WordCase originalCase)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||||
|
|
||||||
|
@ -589,7 +659,7 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// we are looking for a case variant, but this word does not allow it
|
// we are looking for a case variant, but this word does not allow it
|
||||||
if (!acceptCase(caseVariant, wordFlags)) {
|
if (!acceptCase(originalCase, wordFlags, strippedWord, offset, length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
|
||||||
|
@ -654,7 +724,7 @@ final class Stemmer {
|
||||||
true,
|
true,
|
||||||
prefix,
|
prefix,
|
||||||
circumfix,
|
circumfix,
|
||||||
caseVariant));
|
originalCase));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return stems;
|
||||||
|
|
|
@ -28,9 +28,9 @@ enum WordCase {
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
for (int i = 1; i < length; i++) {
|
for (int i = 1; i < length; i++) {
|
||||||
char ch = word[i];
|
CharCase cc = charCase(word[i]);
|
||||||
seenUpper = seenUpper || Character.isUpperCase(ch);
|
seenUpper = seenUpper || cc == CharCase.UPPER;
|
||||||
seenLower = seenLower || Character.isLowerCase(ch);
|
seenLower = seenLower || cc == CharCase.LOWER;
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,9 +43,9 @@ enum WordCase {
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
for (int i = 1; i < length; i++) {
|
for (int i = 1; i < length; i++) {
|
||||||
char ch = word.charAt(i);
|
CharCase cc = charCase(word.charAt(i));
|
||||||
seenUpper = seenUpper || Character.isUpperCase(ch);
|
seenUpper = seenUpper || cc == CharCase.UPPER;
|
||||||
seenLower = seenLower || Character.isLowerCase(ch);
|
seenLower = seenLower || cc == CharCase.LOWER;
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,4 +58,20 @@ enum WordCase {
|
||||||
}
|
}
|
||||||
return seenUpper ? MIXED : LOWER;
|
return seenUpper ? MIXED : LOWER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static CharCase charCase(char c) {
|
||||||
|
if (Character.isUpperCase(c)) {
|
||||||
|
return CharCase.UPPER;
|
||||||
|
}
|
||||||
|
if (Character.isLowerCase(c) && Character.toUpperCase(c) != c) {
|
||||||
|
return CharCase.LOWER;
|
||||||
|
}
|
||||||
|
return CharCase.NEUTRAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Per-character case classification produced by {@code charCase(char)}. */
private enum CharCase {
|
||||||
|
// Character.isUpperCase(c) is true
UPPER,
|
||||||
|
// lowercase char whose Character.toUpperCase(c) differs from the char itself
LOWER,
|
||||||
|
// everything else, including lowercase chars with no distinct single-char uppercase mapping
NEUTRAL
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("allcaps");
|
doTest("allcaps");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Spell-checking test for the "checksharps" fixture set.
// NOTE(review): doTest is not visible here; presumably it loads the fixture files with this
// base name and checks the expected good/wrong word lists - confirm.
@Test
|
||||||
|
public void checkSharpS() throws Exception {
|
||||||
|
doTest("checksharps");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void IJ() throws Exception {
|
public void IJ() throws Exception {
|
||||||
doTest("IJ");
|
doTest("IJ");
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
/** Stemmer tests driven by the {@code checksharps.aff} / {@code checksharps.dic} fixtures. */
public class TestCheckSharpS extends StemmerTestBase {
|
||||||
|
|
||||||
|
// Initializes the test base once with the CHECKSHARPS fixture pair.
@BeforeClass
|
||||||
|
public static void beforeClass() throws Exception {
|
||||||
|
init("checksharps.aff", "checksharps.dic");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exercises title-case and upper-case inputs whose dictionary entries contain a sharp s.
public void testSharpS() {
|
||||||
|
// title-case form of an entry with the sharp s kept
assertStemsTo("Müßig", "müßig");
|
||||||
|
// all-caps form written with SS
assertStemsTo("MÜSSIG", "müßig");
|
||||||
|
// no expected stems are listed for the title-case form written with "ss"
assertStemsTo("Müssig");
|
||||||
|
// all-caps input containing several SS pairs
assertStemsTo("PROZESSIONSSTRASSE", "Prozessionsstraße");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,4 @@
|
||||||
|
# test ß - SS special capitalizing
|
||||||
|
CHECKSHARPS
|
||||||
|
WORDCHARS ß.
|
||||||
|
KEEPCASE k
|
|
@ -0,0 +1,7 @@
|
||||||
|
6
|
||||||
|
müßig/k
|
||||||
|
Ausstoß
|
||||||
|
Abstoß.
|
||||||
|
Außenabmessung
|
||||||
|
Prozessionsstraße
|
||||||
|
Außenmaße
|
|
@ -0,0 +1,13 @@
|
||||||
|
müßig
|
||||||
|
Müßig
|
||||||
|
MÜSSIG
|
||||||
|
Ausstoß
|
||||||
|
Abstoß.
|
||||||
|
Außenabmessung
|
||||||
|
Prozessionsstraße
|
||||||
|
Außenmaße
|
||||||
|
AUSSTOSS
|
||||||
|
ABSTOSS.
|
||||||
|
AUSSENABMESSUNG
|
||||||
|
PROZESSIONSSTRASSE
|
||||||
|
AUSSENMASSE
|
|
@ -0,0 +1,2 @@
|
||||||
|
MÜßIG
|
||||||
|
Müssig
|
Loading…
Reference in New Issue