diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java index 44867bc5043..ac5208c1868 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.analysis.hunspell; -import java.util.List; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; @@ -58,10 +57,9 @@ class CheckCompoundPattern { } boolean prohibitsCompounding( - CharsRef word, int breakPos, List stemsBefore, List stemsAfter) { + CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) { if (isNonAffixedPattern(endChars)) { - if (stemsBefore.stream() - .noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) { + if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) { return false; } } else if (!charsMatch(word, breakPos - endChars.length, endChars)) { @@ -69,18 +67,18 @@ class CheckCompoundPattern { } if (isNonAffixedPattern(beginChars)) { - if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) { + if (!charsMatch(word, breakPos, stemAfter.chars)) { return false; } } else if (!charsMatch(word, breakPos, beginChars)) { return false; } - if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) { + if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) { return false; } //noinspection RedundantIfStatement - if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) { + if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) { return false; } @@ -91,14 +89,9 @@ class CheckCompoundPattern { return pattern.length == 1 && pattern[0] == '0'; } - private boolean hasStemWithFlags(List stems, char[] flags) { - for (CharsRef stem : stems) { - IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length); - if (forms != null && hasAllFlags(flags, forms)) { - return true; - } - } - return false; + private boolean stemHasFlags(CharsRef stem, char[] flags) { + IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length); + return forms != null && hasAllFlags(flags, forms); } private boolean hasAllFlags(char[] flags, IntsRef forms) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index d69940c0562..e59dee20419 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -140,7 +140,7 @@ public class SpellChecker { return false; } - if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) { + if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) { return true; } @@ -156,8 +156,24 @@ public class SpellChecker { return false; } + private CharsRef findStem( + char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { + CharsRef[] result = {null}; + stemmer.doStem( + wordChars, + offset, + length, + originalCase, + context, + (stem, forms, formID) -> { + result[0] = stem; + return false; + }); + return result[0]; + } + private boolean checkCompounds( - CharsRef word, WordCase originalCase, int depth, Predicate> checkPatterns) { + CharsRef word, WordCase originalCase, int depth, Predicate checkPatterns) { if (depth > dictionary.compoundMax - 2) return false; int limit = word.length - dictionary.compoundMin + 1; @@ -165,16 +181,15 @@ public class SpellChecker { WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakOffset = word.offset + breakPos; if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { - List stems = - stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context); - if (stems.isEmpty() + CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context); + if (stem == null && dictionary.simplifiedTriple && word.chars[breakOffset - 1] == word.chars[breakOffset]) { - stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context); + stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); } - if (!stems.isEmpty() && checkPatterns.test(stems)) { - Predicate> nextCheck = checkNextPatterns(word, breakPos, stems); - if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) { + if (stem != null && checkPatterns.test(stem)) { + Predicate nextCheck = checkNextPatterns(word, breakPos, stem); + if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) { return true; } } @@ -195,12 +210,11 @@ public class SpellChecker { if (expanded != null) { WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakPos = pos + pattern.endLength(); - List stems = - stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context); - if (!stems.isEmpty()) { - Predicate> nextCheck = - next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next); - if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) { + CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); + if (stem != null) { + Predicate nextCheck = + next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next); + if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) { return true; } } @@ -209,11 +223,10 @@ public class SpellChecker { return false; } - private Predicate> checkNextPatterns( - CharsRef word, int breakPos, List stems) { - return nextStems -> + private Predicate checkNextPatterns(CharsRef word, int breakPos, CharsRef stems) { + return nextStem -> dictionary.checkCompoundPatterns.stream() - .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems)); + .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStem)); } private boolean checkCompoundsAfter( @@ -221,16 +234,16 @@ public class SpellChecker { int breakPos, WordCase originalCase, int depth, - List prevStems, - Predicate> checkPatterns) { + CharsRef prevStem, + Predicate checkPatterns) { int remainingLength = word.length - breakPos; int breakOffset = word.offset + breakPos; - List tailStems = - stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); - if (!tailStems.isEmpty() - && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems)) + CharsRef tailStem = + findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); + if (tailStem != null + && !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem)) && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase) - && checkPatterns.test(tailStems)) { + && checkPatterns.test(tailStem)) { return true; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 729371d0745..44e26751f75 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -100,24 +100,41 @@ final class Stemmer { return Collections.emptyList(); } + List list = new ArrayList<>(); + RootProcessor processor = + (stem, forms, formID) -> { + list.add(newStem(stem, forms, formID)); + return true; + }; + + if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) { + return list; + } + WordCase wordCase = caseOf(word, length); - List list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD); if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { - addCaseVariations(word, length, wordCase, list); + addCaseVariations(word, length, wordCase, processor); } return list; } - private void addCaseVariations(char[] word, int length, WordCase wordCase, List list) { + private void addCaseVariations( + char[] word, int length, WordCase wordCase, RootProcessor processor) { if (wordCase == WordCase.UPPER) { caseFoldTitle(word, length); char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length); if (aposCase != null) { - list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD)); + if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { + return; + } + } + if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { + return; } - list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD)); for (char[] variation : sharpSVariations(titleBuffer, length)) { - list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD)); + if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) { + return; + } } } @@ -126,10 +143,14 @@ final class Stemmer { } caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length); - list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD)); + if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) { + return; + } if (wordCase == WordCase.UPPER) { for (char[] variation : sharpSVariations(lowerBuffer, length)) { - list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD)); + if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) { + return; + } } } } @@ -220,9 +241,13 @@ final class Stemmer { return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList()); } - List doStem( - char[] word, int offset, int length, WordCase originalCase, WordContext context) { - List stems = new ArrayList<>(); + boolean doStem( + char[] word, + int offset, + int length, + WordCase originalCase, + WordContext context, + RootProcessor processor) { IntsRef forms = dictionary.lookupWord(word, offset, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { @@ -241,36 +266,37 @@ final class Stemmer { if (context.isCompound()) { if (context != WordContext.COMPOUND_END && Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) { - return new ArrayList<>(); + return false; } if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag) && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) { continue; } } - stems.add(newStem(word, offset, length, forms, i)); + if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) { + return false; + } } } try { - stems.addAll( - stem( - word, - offset, - length, - context, - -1, - Dictionary.FLAG_UNSET, - -1, - 0, - true, - true, - false, - false, - originalCase)); + return stem( + word, + offset, + length, + context, + -1, + Dictionary.FLAG_UNSET, + -1, + 0, + true, + true, + false, + false, + originalCase, + processor); } catch (IOException bogus) { throw new RuntimeException(bogus); } - return stems; } private boolean acceptCase( @@ -319,7 +345,12 @@ final class Stemmer { return deduped; } - private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) { + interface RootProcessor { + /** @return whether the processing should be continued */ + boolean processRoot(CharsRef stem, IntsRef forms, int formID); + } + + private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) { final String exception; if (dictionary.hasStemExceptions) { int exceptionID = forms.ints[forms.offset + formID + 1]; @@ -337,7 +368,7 @@ final class Stemmer { if (exception != null) { scratchSegment.append(exception); } else { - scratchSegment.append(buffer, offset, length); + scratchSegment.append(stem.chars, stem.offset, stem.length); } try { Dictionary.applyMappings(dictionary.oconv, scratchSegment); @@ -351,7 +382,7 @@ final class Stemmer { if (exception != null) { return new CharsRef(exception); } else { - return new CharsRef(buffer, offset, length); + return stem; } } } @@ -385,9 +416,9 @@ final class Stemmer { * most suffix must also contain circumfix flag. * @param originalCase if non-null, represents original word case to disallow case variations of * word with KEEPCASE flags - * @return List of stems, or empty list if no stems are found + * @return whether the processing should be continued */ - private List stem( + private boolean stem( char[] word, int offset, int length, @@ -400,12 +431,9 @@ final class Stemmer { boolean doSuffix, boolean previousWasPrefix, boolean circumfix, - WordCase originalCase) + WordCase originalCase, + RootProcessor processor) throws IOException { - - // TODO: allow this stuff to be reused by tokenfilter - List stems = new ArrayList<>(); - if (doPrefix && dictionary.prefixes != null) { FST fst = dictionary.prefixes; FST.Arc arc = prefixArcs[recursionDepth]; @@ -440,19 +468,21 @@ final class Stemmer { } boolean pureAffix = strippedWord == word; - stems.addAll( - applyAffix( - strippedWord, - pureAffix ? offset + i : 0, - pureAffix ? length - i : strippedWord.length, - context, - prefix, - previous, - -1, - recursionDepth, - true, - circumfix, - originalCase)); + if (!applyAffix( + strippedWord, + pureAffix ? offset + i : 0, + pureAffix ? length - i : strippedWord.length, + context, + prefix, + previous, + -1, + recursionDepth, + true, + circumfix, + originalCase, + processor)) { + return false; + } } } } @@ -493,25 +523,27 @@ final class Stemmer { } boolean pureAffix = strippedWord == word; - stems.addAll( - applyAffix( - strippedWord, - pureAffix ? offset : 0, - pureAffix ? i : strippedWord.length, - context, - suffix, - previous, - prefixId, - recursionDepth, - false, - circumfix, - originalCase)); + if (!applyAffix( + strippedWord, + pureAffix ? offset : 0, + pureAffix ? i : strippedWord.length, + context, + suffix, + previous, + prefixId, + recursionDepth, + false, + circumfix, + originalCase, + processor)) { + return false; + } } } } } - return stems; + return true; } /** @@ -632,9 +664,9 @@ final class Stemmer { * as a stem! * @param recursionDepth current recursion depth * @param prefix true if we are removing a prefix (false if it's a suffix) - * @return List of stems for the word, or an empty list if none are found + * @return whether the processing should be continued */ - private List applyAffix( + private boolean applyAffix( char[] strippedWord, int offset, int length, @@ -645,12 +677,11 @@ final class Stemmer { int recursionDepth, boolean prefix, boolean circumfix, - WordCase originalCase) + WordCase originalCase, + RootProcessor processor) throws IOException { char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG); - List stems = new ArrayList<>(); - boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix); IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length); if (forms != null) { @@ -694,7 +725,9 @@ final class Stemmer { continue; } } - stems.add(newStem(strippedWord, offset, length, forms, i)); + if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) { + return false; + } } } } @@ -720,7 +753,7 @@ final class Stemmer { // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed // COMPLEXPREFIXES = false: combine with another suffix } else { - return stems; + return true; } } else { doPrefix = false; @@ -728,29 +761,29 @@ final class Stemmer { prefixId = affix; // we took away the second prefix: go look for another suffix } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) { - return stems; + return true; } // we took away a prefix, then a suffix: go look for another suffix } - stems.addAll( - stem( - strippedWord, - offset, - length, - context, - affix, - flag, - prefixId, - recursionDepth + 1, - doPrefix, - true, - prefix, - circumfix, - originalCase)); + return stem( + strippedWord, + offset, + length, + context, + affix, + flag, + prefixId, + recursionDepth + 1, + doPrefix, + true, + prefix, + circumfix, + originalCase, + processor); } - return stems; + return true; } private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {