mirror of https://github.com/apache/lucene.git
LUCENE-9771: Hunspell: don't lookup word roots unnecessarily to check flags (#2369)
This commit is contained in:
parent
1ff11dd02c
commit
ef920388e6
|
@ -17,7 +17,6 @@
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
|
||||||
|
|
||||||
class CheckCompoundPattern {
|
class CheckCompoundPattern {
|
||||||
private final String endChars;
|
private final String endChars;
|
||||||
|
@ -51,10 +50,9 @@ class CheckCompoundPattern {
|
||||||
return endChars + " " + beginChars + (replacement == null ? "" : " -> " + replacement);
|
return endChars + " " + beginChars + (replacement == null ? "" : " -> " + replacement);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean prohibitsCompounding(
|
boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
|
||||||
CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
|
|
||||||
if (isNonAffixedPattern(endChars)) {
|
if (isNonAffixedPattern(endChars)) {
|
||||||
if (!charsMatch(word, breakPos - stemBefore.length, stemBefore)) {
|
if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
|
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
|
||||||
|
@ -62,18 +60,18 @@ class CheckCompoundPattern {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isNonAffixedPattern(beginChars)) {
|
if (isNonAffixedPattern(beginChars)) {
|
||||||
if (!charsMatch(word, breakPos, stemAfter)) {
|
if (!charsMatch(word, breakPos, rootAfter.word)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (!charsMatch(word, breakPos, beginChars)) {
|
} else if (!charsMatch(word, breakPos, beginChars)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
|
if (endFlags.length > 0 && !hasAllFlags(rootBefore, endFlags)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
//noinspection RedundantIfStatement
|
//noinspection RedundantIfStatement
|
||||||
if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
|
if (beginFlags.length > 0 && !hasAllFlags(rootAfter, beginFlags)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,14 +82,9 @@ class CheckCompoundPattern {
|
||||||
return pattern.length() == 1 && pattern.charAt(0) == '0';
|
return pattern.length() == 1 && pattern.charAt(0) == '0';
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean stemHasFlags(CharsRef stem, char[] flags) {
|
private boolean hasAllFlags(Root<?> root, char[] flags) {
|
||||||
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
|
|
||||||
return forms != null && hasAllFlags(flags, forms);
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean hasAllFlags(char[] flags, IntsRef forms) {
|
|
||||||
for (char flag : flags) {
|
for (char flag : flags) {
|
||||||
if (!dictionary.hasFlag(forms, flag)) {
|
if (!dictionary.hasFlag(root.entryId, flag)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -226,13 +226,13 @@ public class Hunspell {
|
||||||
int breakPos = prev.length;
|
int breakPos = prev.length;
|
||||||
int remainingLength = word.length - breakPos;
|
int remainingLength = word.length - breakPos;
|
||||||
int breakOffset = word.offset + breakPos;
|
int breakOffset = word.offset + breakPos;
|
||||||
Root<CharsRef> tailStem =
|
Root<CharsRef> lastRoot =
|
||||||
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||||
if (tailStem != null
|
if (lastRoot != null
|
||||||
&& !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
|
&& !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
|
||||||
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
|
&& !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
|
||||||
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
|
&& !hasForceUCaseProblem(lastRoot, originalCase)
|
||||||
&& prev.mayCompound(tailStem, remainingLength, originalCase)) {
|
&& prev.mayCompound(lastRoot, remainingLength, originalCase)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -240,17 +240,9 @@ public class Hunspell {
|
||||||
return checkCompounds(tail, originalCase, prev);
|
return checkCompounds(tail, originalCase, prev);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasForceUCaseProblem(
|
private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase) {
|
||||||
char[] chars, int offset, int length, WordCase originalCase) {
|
|
||||||
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
|
||||||
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
||||||
|
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
|
||||||
IntsRef forms = dictionary.lookupWord(chars, offset, length);
|
|
||||||
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
|
|
||||||
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -274,19 +266,20 @@ public class Hunspell {
|
||||||
private class CompoundPart {
|
private class CompoundPart {
|
||||||
final CompoundPart prev;
|
final CompoundPart prev;
|
||||||
final int index, length;
|
final int index, length;
|
||||||
final CharsRef tail, stem;
|
final CharsRef tail;
|
||||||
|
final Root<CharsRef> root;
|
||||||
final CheckCompoundPattern enablingPattern;
|
final CheckCompoundPattern enablingPattern;
|
||||||
|
|
||||||
CompoundPart(
|
CompoundPart(
|
||||||
CompoundPart prev,
|
CompoundPart prev,
|
||||||
CharsRef tail,
|
CharsRef tail,
|
||||||
int length,
|
int length,
|
||||||
Root<CharsRef> stem,
|
Root<CharsRef> root,
|
||||||
CheckCompoundPattern enabler) {
|
CheckCompoundPattern enabler) {
|
||||||
this.prev = prev;
|
this.prev = prev;
|
||||||
this.tail = tail;
|
this.tail = tail;
|
||||||
this.length = length;
|
this.length = length;
|
||||||
this.stem = stem.word;
|
this.root = root;
|
||||||
index = prev == null ? 1 : prev.index + 1;
|
index = prev == null ? 1 : prev.index + 1;
|
||||||
enablingPattern = enabler;
|
enablingPattern = enabler;
|
||||||
}
|
}
|
||||||
|
@ -296,12 +289,12 @@ public class Hunspell {
|
||||||
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
|
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
|
boolean mayCompound(Root<CharsRef> nextRoot, int nextPartLength, WordCase originalCase) {
|
||||||
boolean patternsOk =
|
boolean patternsOk =
|
||||||
enablingPattern != null
|
enablingPattern != null
|
||||||
? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
|
? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
|
||||||
: dictionary.checkCompoundPatterns.stream()
|
: dictionary.checkCompoundPatterns.stream()
|
||||||
.noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
|
.noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
|
||||||
if (!patternsOk) {
|
if (!patternsOk) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue