LUCENE-9771: Hunspell: don't look up word roots unnecessarily to check flags (#2369)

This commit is contained in:
Peter Gromov 2021-02-15 20:21:44 +01:00 committed by GitHub
parent 1ff11dd02c
commit ef920388e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 22 additions and 36 deletions

View File

@ -17,7 +17,6 @@
package org.apache.lucene.analysis.hunspell;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
class CheckCompoundPattern {
private final String endChars;
@ -51,10 +50,9 @@ class CheckCompoundPattern {
return endChars + " " + beginChars + (replacement == null ? "" : " -> " + replacement);
}
boolean prohibitsCompounding(
CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
if (isNonAffixedPattern(endChars)) {
if (!charsMatch(word, breakPos - stemBefore.length, stemBefore)) {
if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
return false;
}
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
@ -62,18 +60,18 @@ class CheckCompoundPattern {
}
if (isNonAffixedPattern(beginChars)) {
if (!charsMatch(word, breakPos, stemAfter)) {
if (!charsMatch(word, breakPos, rootAfter.word)) {
return false;
}
} else if (!charsMatch(word, breakPos, beginChars)) {
return false;
}
if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
if (endFlags.length > 0 && !hasAllFlags(rootBefore, endFlags)) {
return false;
}
//noinspection RedundantIfStatement
if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
if (beginFlags.length > 0 && !hasAllFlags(rootAfter, beginFlags)) {
return false;
}
@ -84,14 +82,9 @@ class CheckCompoundPattern {
return pattern.length() == 1 && pattern.charAt(0) == '0';
}
private boolean stemHasFlags(CharsRef stem, char[] flags) {
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
return forms != null && hasAllFlags(flags, forms);
}
private boolean hasAllFlags(char[] flags, IntsRef forms) {
private boolean hasAllFlags(Root<?> root, char[] flags) {
for (char flag : flags) {
if (!dictionary.hasFlag(forms, flag)) {
if (!dictionary.hasFlag(root.entryId, flag)) {
return false;
}
}

View File

@ -226,13 +226,13 @@ public class Hunspell {
int breakPos = prev.length;
int remainingLength = word.length - breakPos;
int breakOffset = word.offset + breakPos;
Root<CharsRef> tailStem =
Root<CharsRef> lastRoot =
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (tailStem != null
&& !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
&& prev.mayCompound(tailStem, remainingLength, originalCase)) {
if (lastRoot != null
&& !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
&& !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
&& !hasForceUCaseProblem(lastRoot, originalCase)
&& prev.mayCompound(lastRoot, remainingLength, originalCase)) {
return true;
}
@ -240,17 +240,9 @@ public class Hunspell {
return checkCompounds(tail, originalCase, prev);
}
private boolean hasForceUCaseProblem(
char[] chars, int offset, int length, WordCase originalCase) {
if (dictionary.forceUCase == FLAG_UNSET) return false;
private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase) {
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
IntsRef forms = dictionary.lookupWord(chars, offset, length);
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
}
private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
return cr1.toString().equalsIgnoreCase(cr2.toString());
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
}
/**
@ -274,19 +266,20 @@ public class Hunspell {
private class CompoundPart {
final CompoundPart prev;
final int index, length;
final CharsRef tail, stem;
final CharsRef tail;
final Root<CharsRef> root;
final CheckCompoundPattern enablingPattern;
CompoundPart(
CompoundPart prev,
CharsRef tail,
int length,
Root<CharsRef> stem,
Root<CharsRef> root,
CheckCompoundPattern enabler) {
this.prev = prev;
this.tail = tail;
this.length = length;
this.stem = stem.word;
this.root = root;
index = prev == null ? 1 : prev.index + 1;
enablingPattern = enabler;
}
@ -296,12 +289,12 @@ public class Hunspell {
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
}
boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
boolean mayCompound(Root<CharsRef> nextRoot, int nextPartLength, WordCase originalCase) {
boolean patternsOk =
enablingPattern != null
? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
: dictionary.checkCompoundPatterns.stream()
.noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
.noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
if (!patternsOk) {
return false;
}