mirror of https://github.com/apache/lucene.git
LUCENE-9771: Hunspell: don't lookup word roots unnecessarily to check flags (#2369)
This commit is contained in:
parent
1ff11dd02c
commit
ef920388e6
|
@ -17,7 +17,6 @@
|
|||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
class CheckCompoundPattern {
|
||||
private final String endChars;
|
||||
|
@ -51,10 +50,9 @@ class CheckCompoundPattern {
|
|||
return endChars + " " + beginChars + (replacement == null ? "" : " -> " + replacement);
|
||||
}
|
||||
|
||||
boolean prohibitsCompounding(
|
||||
CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
|
||||
boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
|
||||
if (isNonAffixedPattern(endChars)) {
|
||||
if (!charsMatch(word, breakPos - stemBefore.length, stemBefore)) {
|
||||
if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
|
||||
return false;
|
||||
}
|
||||
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
|
||||
|
@ -62,18 +60,18 @@ class CheckCompoundPattern {
|
|||
}
|
||||
|
||||
if (isNonAffixedPattern(beginChars)) {
|
||||
if (!charsMatch(word, breakPos, stemAfter)) {
|
||||
if (!charsMatch(word, breakPos, rootAfter.word)) {
|
||||
return false;
|
||||
}
|
||||
} else if (!charsMatch(word, breakPos, beginChars)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
|
||||
if (endFlags.length > 0 && !hasAllFlags(rootBefore, endFlags)) {
|
||||
return false;
|
||||
}
|
||||
//noinspection RedundantIfStatement
|
||||
if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
|
||||
if (beginFlags.length > 0 && !hasAllFlags(rootAfter, beginFlags)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -84,14 +82,9 @@ class CheckCompoundPattern {
|
|||
return pattern.length() == 1 && pattern.charAt(0) == '0';
|
||||
}
|
||||
|
||||
private boolean stemHasFlags(CharsRef stem, char[] flags) {
|
||||
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
|
||||
return forms != null && hasAllFlags(flags, forms);
|
||||
}
|
||||
|
||||
private boolean hasAllFlags(char[] flags, IntsRef forms) {
|
||||
private boolean hasAllFlags(Root<?> root, char[] flags) {
|
||||
for (char flag : flags) {
|
||||
if (!dictionary.hasFlag(forms, flag)) {
|
||||
if (!dictionary.hasFlag(root.entryId, flag)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -226,13 +226,13 @@ public class Hunspell {
|
|||
int breakPos = prev.length;
|
||||
int remainingLength = word.length - breakPos;
|
||||
int breakOffset = word.offset + breakPos;
|
||||
Root<CharsRef> tailStem =
|
||||
Root<CharsRef> lastRoot =
|
||||
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||
if (tailStem != null
|
||||
&& !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
|
||||
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
|
||||
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
|
||||
&& prev.mayCompound(tailStem, remainingLength, originalCase)) {
|
||||
if (lastRoot != null
|
||||
&& !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
|
||||
&& !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
|
||||
&& !hasForceUCaseProblem(lastRoot, originalCase)
|
||||
&& prev.mayCompound(lastRoot, remainingLength, originalCase)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -240,17 +240,9 @@ public class Hunspell {
|
|||
return checkCompounds(tail, originalCase, prev);
|
||||
}
|
||||
|
||||
private boolean hasForceUCaseProblem(
|
||||
char[] chars, int offset, int length, WordCase originalCase) {
|
||||
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
||||
private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase) {
|
||||
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
||||
|
||||
IntsRef forms = dictionary.lookupWord(chars, offset, length);
|
||||
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
|
||||
}
|
||||
|
||||
private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
|
||||
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
||||
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -274,19 +266,20 @@ public class Hunspell {
|
|||
private class CompoundPart {
|
||||
final CompoundPart prev;
|
||||
final int index, length;
|
||||
final CharsRef tail, stem;
|
||||
final CharsRef tail;
|
||||
final Root<CharsRef> root;
|
||||
final CheckCompoundPattern enablingPattern;
|
||||
|
||||
CompoundPart(
|
||||
CompoundPart prev,
|
||||
CharsRef tail,
|
||||
int length,
|
||||
Root<CharsRef> stem,
|
||||
Root<CharsRef> root,
|
||||
CheckCompoundPattern enabler) {
|
||||
this.prev = prev;
|
||||
this.tail = tail;
|
||||
this.length = length;
|
||||
this.stem = stem.word;
|
||||
this.root = root;
|
||||
index = prev == null ? 1 : prev.index + 1;
|
||||
enablingPattern = enabler;
|
||||
}
|
||||
|
@ -296,12 +289,12 @@ public class Hunspell {
|
|||
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
|
||||
}
|
||||
|
||||
boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
|
||||
boolean mayCompound(Root<CharsRef> nextRoot, int nextPartLength, WordCase originalCase) {
|
||||
boolean patternsOk =
|
||||
enablingPattern != null
|
||||
? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
|
||||
? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
|
||||
: dictionary.checkCompoundPatterns.stream()
|
||||
.noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
|
||||
.noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
|
||||
if (!patternsOk) {
|
||||
return false;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue