mirror of https://github.com/apache/lucene.git
LUCENE-9746: Hunspell: unify case variation logic in Stemmer and SpellChecker (#2322)
This commit is contained in:
parent
d0b4ef66d7
commit
80803eb9ad
|
@ -73,8 +73,12 @@ public class SpellChecker {
|
||||||
}
|
}
|
||||||
|
|
||||||
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
|
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
|
||||||
if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
|
if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) {
|
||||||
return true;
|
Stemmer.CaseVariationProcessor variationProcessor =
|
||||||
|
(variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
|
||||||
|
if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
|
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
|
||||||
|
@ -92,42 +96,6 @@ public class SpellChecker {
|
||||||
return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
|
return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
|
|
||||||
char[] caseVariant = wordChars;
|
|
||||||
if (wordCase == WordCase.UPPER) {
|
|
||||||
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
|
||||||
if (checkWord(caseVariant, wordChars.length, wordCase)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
|
|
||||||
if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
|
|
||||||
if (checkWord(variation, variation.length, null)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dictionary.isDotICaseChangeDisallowed(wordChars)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
|
|
||||||
if (checkWord(lower, wordChars.length, wordCase)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (wordCase == WordCase.UPPER) {
|
|
||||||
for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
|
|
||||||
if (checkWord(variation, variation.length, null)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean checkWord(String word) {
|
boolean checkWord(String word) {
|
||||||
return checkWord(word.toCharArray(), word.length(), null);
|
return checkWord(word.toCharArray(), word.length(), null);
|
||||||
}
|
}
|
||||||
|
|
|
@ -111,46 +111,47 @@ final class Stemmer {
|
||||||
|
|
||||||
WordCase wordCase = caseOf(word, length);
|
WordCase wordCase = caseOf(word, length);
|
||||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||||
addCaseVariations(word, length, wordCase, processor);
|
CaseVariationProcessor variationProcessor =
|
||||||
|
(variant, varLength, originalCase) ->
|
||||||
|
doStem(variant, 0, varLength, originalCase, WordContext.SIMPLE_WORD, processor);
|
||||||
|
varyCase(word, length, wordCase, variationProcessor);
|
||||||
}
|
}
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addCaseVariations(
|
interface CaseVariationProcessor {
|
||||||
char[] word, int length, WordCase wordCase, RootProcessor processor) {
|
boolean process(char[] word, int length, WordCase originalCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
caseFoldTitle(word, length);
|
caseFoldTitle(word, length);
|
||||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||||
if (aposCase != null) {
|
if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
|
||||||
if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
return false;
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
if (!processor.process(titleBuffer, length, wordCase)) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
for (char[] variation : sharpSVariations(titleBuffer, length)) {
|
if (dictionary.checkSharpS && !varySharpS(titleBuffer, length, processor)) {
|
||||||
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
|
return false;
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dictionary.isDotICaseChangeDisallowed(word)) {
|
if (dictionary.isDotICaseChangeDisallowed(word)) {
|
||||||
return;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||||
if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
if (!processor.process(lowerBuffer, length, wordCase)) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER
|
||||||
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
|
&& dictionary.checkSharpS
|
||||||
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
|
&& !varySharpS(lowerBuffer, length, processor)) {
|
||||||
return;
|
return false;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// temporary buffers for case variants
|
// temporary buffers for case variants
|
||||||
|
@ -167,26 +168,24 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds titlecase variant of word to titleBuffer */
|
/** folds titlecase variant of word to titleBuffer */
|
||||||
char[] caseFoldTitle(char[] word, int length) {
|
private void caseFoldTitle(char[] word, int length) {
|
||||||
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
||||||
System.arraycopy(word, 0, titleBuffer, 0, length);
|
System.arraycopy(word, 0, titleBuffer, 0, length);
|
||||||
for (int i = 1; i < length; i++) {
|
for (int i = 1; i < length; i++) {
|
||||||
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
||||||
}
|
}
|
||||||
return titleBuffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
||||||
char[] caseFoldLower(char[] word, int length) {
|
private void caseFoldLower(char[] word, int length) {
|
||||||
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
||||||
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
||||||
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
||||||
return lowerBuffer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special prefix handling for Catalan, French, Italian:
|
// Special prefix handling for Catalan, French, Italian:
|
||||||
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
|
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
|
||||||
static char[] capitalizeAfterApostrophe(char[] word, int length) {
|
private static char[] capitalizeAfterApostrophe(char[] word, int length) {
|
||||||
for (int i = 1; i < length - 1; i++) {
|
for (int i = 1; i < length - 1; i++) {
|
||||||
if (word[i] == '\'') {
|
if (word[i] == '\'') {
|
||||||
char next = word[i + 1];
|
char next = word[i + 1];
|
||||||
|
@ -201,9 +200,7 @@ final class Stemmer {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<char[]> sharpSVariations(char[] word, int length) {
|
private boolean varySharpS(char[] word, int length, CaseVariationProcessor processor) {
|
||||||
if (!dictionary.checkSharpS) return Collections.emptyList();
|
|
||||||
|
|
||||||
Stream<String> result =
|
Stream<String> result =
|
||||||
new Object() {
|
new Object() {
|
||||||
int findSS(int start) {
|
int findSS(int start) {
|
||||||
|
@ -233,10 +230,15 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}.replaceSS(0, 0);
|
}.replaceSS(0, 0);
|
||||||
if (result == null) return Collections.emptyList();
|
if (result == null) return true;
|
||||||
|
|
||||||
String src = new String(word, 0, length);
|
String src = new String(word, 0, length);
|
||||||
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
|
for (String s : result.collect(Collectors.toList())) {
|
||||||
|
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean doStem(
|
boolean doStem(
|
||||||
|
|
Loading…
Reference in New Issue