LUCENE-9746: Hunspell: unify case variation logic in Stemmer and SpellChecker (#2322)

This commit is contained in:
Peter Gromov 2021-02-08 21:37:32 +01:00 committed by GitHub
parent d0b4ef66d7
commit 80803eb9ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 70 deletions

View File

@ -73,8 +73,12 @@ public class SpellChecker {
} }
WordCase wc = stemmer.caseOf(wordChars, wordChars.length); WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) { if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) {
return true; Stemmer.CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) {
return true;
}
} }
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
@ -92,42 +96,6 @@ public class SpellChecker {
return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1)); return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
} }
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
char[] caseVariant = wordChars;
if (wordCase == WordCase.UPPER) {
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
if (checkWord(caseVariant, wordChars.length, wordCase)) {
return true;
}
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
return true;
}
for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
if (checkWord(variation, variation.length, null)) {
return true;
}
}
}
if (dictionary.isDotICaseChangeDisallowed(wordChars)) {
return false;
}
char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
if (checkWord(lower, wordChars.length, wordCase)) {
return true;
}
if (wordCase == WordCase.UPPER) {
for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
if (checkWord(variation, variation.length, null)) {
return true;
}
}
}
return false;
}
boolean checkWord(String word) { boolean checkWord(String word) {
return checkWord(word.toCharArray(), word.length(), null); return checkWord(word.toCharArray(), word.length(), null);
} }

View File

@ -111,46 +111,47 @@ final class Stemmer {
WordCase wordCase = caseOf(word, length); WordCase wordCase = caseOf(word, length);
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
addCaseVariations(word, length, wordCase, processor); CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) ->
doStem(variant, 0, varLength, originalCase, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
} }
return list; return list;
} }
private void addCaseVariations( interface CaseVariationProcessor {
char[] word, int length, WordCase wordCase, RootProcessor processor) { boolean process(char[] word, int length, WordCase originalCase);
}
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length); caseFoldTitle(word, length);
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length); char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null) { if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { return false;
return;
}
} }
if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { if (!processor.process(titleBuffer, length, wordCase)) {
return; return false;
} }
for (char[] variation : sharpSVariations(titleBuffer, length)) { if (dictionary.checkSharpS && !varySharpS(titleBuffer, length, processor)) {
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) { return false;
return;
}
} }
} }
if (dictionary.isDotICaseChangeDisallowed(word)) { if (dictionary.isDotICaseChangeDisallowed(word)) {
return; return true;
} }
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length); caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { if (!processor.process(lowerBuffer, length, wordCase)) {
return; return false;
} }
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER
for (char[] variation : sharpSVariations(lowerBuffer, length)) { && dictionary.checkSharpS
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) { && !varySharpS(lowerBuffer, length, processor)) {
return; return false;
}
}
} }
return true;
} }
// temporary buffers for case variants // temporary buffers for case variants
@ -167,26 +168,24 @@ final class Stemmer {
} }
/** folds titlecase variant of word to titleBuffer */ /** folds titlecase variant of word to titleBuffer */
char[] caseFoldTitle(char[] word, int length) { private void caseFoldTitle(char[] word, int length) {
titleBuffer = ArrayUtil.grow(titleBuffer, length); titleBuffer = ArrayUtil.grow(titleBuffer, length);
System.arraycopy(word, 0, titleBuffer, 0, length); System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) { for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]); titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
} }
return titleBuffer;
} }
/** folds lowercase variant of word (title cased) to lowerBuffer */ /** folds lowercase variant of word (title cased) to lowerBuffer */
char[] caseFoldLower(char[] word, int length) { private void caseFoldLower(char[] word, int length) {
lowerBuffer = ArrayUtil.grow(lowerBuffer, length); lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
System.arraycopy(word, 0, lowerBuffer, 0, length); System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]); lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
return lowerBuffer;
} }
// Special prefix handling for Catalan, French, Italian: // Special prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
static char[] capitalizeAfterApostrophe(char[] word, int length) { private static char[] capitalizeAfterApostrophe(char[] word, int length) {
for (int i = 1; i < length - 1; i++) { for (int i = 1; i < length - 1; i++) {
if (word[i] == '\'') { if (word[i] == '\'') {
char next = word[i + 1]; char next = word[i + 1];
@ -201,9 +200,7 @@ final class Stemmer {
return null; return null;
} }
List<char[]> sharpSVariations(char[] word, int length) { private boolean varySharpS(char[] word, int length, CaseVariationProcessor processor) {
if (!dictionary.checkSharpS) return Collections.emptyList();
Stream<String> result = Stream<String> result =
new Object() { new Object() {
int findSS(int start) { int findSS(int start) {
@ -233,10 +230,15 @@ final class Stemmer {
} }
} }
}.replaceSS(0, 0); }.replaceSS(0, 0);
if (result == null) return Collections.emptyList(); if (result == null) return true;
String src = new String(word, 0, length); String src = new String(word, 0, length);
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList()); for (String s : result.collect(Collectors.toList())) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
}
return true;
} }
boolean doStem( boolean doStem(