mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 21:39:25 +00:00
LUCENE-9803: Hunspell: don't check second stage suffixes if the first stage flag only occurs in prefixes (#2418)
This commit is contained in:
parent
7d3f3d61ce
commit
342ea856d3
@ -149,7 +149,7 @@ public class Dictionary {
|
||||
* All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
|
||||
* need to do 2-level affix stripping with it.
|
||||
*/
|
||||
private char[] secondStageAffixFlags;
|
||||
private char[] secondStagePrefixFlags, secondStageSuffixFlags;
|
||||
|
||||
char circumfix;
|
||||
char keepcase, forceUCase;
|
||||
@ -333,7 +333,8 @@ public class Dictionary {
|
||||
throws IOException, ParseException {
|
||||
TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
|
||||
TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
|
||||
Set<Character> stage2Flags = new HashSet<>();
|
||||
Set<Character> prefixContFlags = new HashSet<>();
|
||||
Set<Character> suffixContFlags = new HashSet<>();
|
||||
Map<String, Integer> seenPatterns = new HashMap<>();
|
||||
|
||||
// zero condition -> 0 ord
|
||||
@ -361,9 +362,9 @@ public class Dictionary {
|
||||
} else if ("AM".equals(firstWord)) {
|
||||
parseMorphAlias(line);
|
||||
} else if ("PFX".equals(firstWord)) {
|
||||
parseAffix(prefixes, stage2Flags, line, reader, false, seenPatterns, seenStrips, flags);
|
||||
parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags);
|
||||
} else if ("SFX".equals(firstWord)) {
|
||||
parseAffix(suffixes, stage2Flags, line, reader, true, seenPatterns, seenStrips, flags);
|
||||
parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags);
|
||||
} else if (line.equals("COMPLEXPREFIXES")) {
|
||||
complexPrefixes =
|
||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||
@ -478,7 +479,8 @@ public class Dictionary {
|
||||
|
||||
this.prefixes = affixFST(prefixes);
|
||||
this.suffixes = affixFST(suffixes);
|
||||
secondStageAffixFlags = toSortedCharArray(stage2Flags);
|
||||
secondStagePrefixFlags = toSortedCharArray(prefixContFlags);
|
||||
secondStageSuffixFlags = toSortedCharArray(suffixContFlags);
|
||||
|
||||
int totalChars = 0;
|
||||
for (String strip : seenStrips.keySet()) {
|
||||
@ -1624,8 +1626,12 @@ public class Dictionary {
|
||||
return chars;
|
||||
}
|
||||
|
||||
boolean isSecondStageAffix(char flag) {
|
||||
return Arrays.binarySearch(secondStageAffixFlags, flag) >= 0;
|
||||
/**
 * Tells whether {@code flag} is present in {@code secondStagePrefixFlags}.
 *
 * <p>The lookup is a binary search, which is valid because the field is assigned from {@code
 * toSortedCharArray(prefixContFlags)}.
 *
 * <p>NOTE(review): {@code prefixContFlags} is the set handed to {@code parseAffix} for PFX lines;
 * presumably it collects the continuation flags found on prefix rules - confirm in {@code
 * parseAffix}.
 *
 * @param flag the affix flag to look up
 * @return {@code true} if the flag is found in {@code secondStagePrefixFlags}
 */
boolean isSecondStagePrefix(char flag) {
|
||||
return Arrays.binarySearch(secondStagePrefixFlags, flag) >= 0;
|
||||
}
|
||||
|
||||
/**
 * Tells whether {@code flag} is present in {@code secondStageSuffixFlags}.
 *
 * <p>The lookup is a binary search, which is valid because the field is assigned from {@code
 * toSortedCharArray(suffixContFlags)}.
 *
 * <p>NOTE(review): {@code suffixContFlags} is the set handed to {@code parseAffix} for SFX lines;
 * presumably it collects the continuation flags found on suffix rules - confirm in {@code
 * parseAffix}.
 *
 * @param flag the affix flag to look up
 * @return {@code true} if the flag is found in {@code secondStageSuffixFlags}
 */
boolean isSecondStageSuffix(char flag) {
|
||||
return Arrays.binarySearch(secondStageSuffixFlags, flag) >= 0;
|
||||
}
|
||||
|
||||
/** folds single character (according to LANG if present) */
|
||||
|
@ -635,11 +635,11 @@ final class Stemmer {
|
||||
if (recursionDepth == 0) {
|
||||
if (prefix) {
|
||||
prefixId = affix;
|
||||
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag);
|
||||
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
|
||||
// we took away the first prefix.
|
||||
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
|
||||
// COMPLEXPREFIXES = false: combine with a suffix
|
||||
} else if (!dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag)) {
|
||||
} else if (!dictionary.complexPrefixes && dictionary.isSecondStageSuffix(flag)) {
|
||||
doPrefix = false;
|
||||
// we took away a suffix.
|
||||
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
|
||||
@ -652,7 +652,7 @@ final class Stemmer {
|
||||
if (prefix && dictionary.complexPrefixes) {
|
||||
prefixId = affix;
|
||||
// we took away the second prefix: go look for another suffix
|
||||
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageAffix(flag)) {
|
||||
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
|
||||
return true;
|
||||
}
|
||||
// we took away a prefix, then a suffix: go look for another suffix
|
||||
|
@ -77,7 +77,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||
|
||||
@Test
|
||||
public void fr() throws Exception {
|
||||
checkAnalysisPerformance("fr", 80_000);
|
||||
checkAnalysisPerformance("fr", 100_000);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
x
Reference in New Issue
Block a user