mirror of https://github.com/apache/lucene.git
LUCENE-9806: Hunspell: speed up affix condition checking (#2423)
* LUCENE-9806: Hunspell: speed up affix condition checking: check only the stem beginning/end without strip/condition, not the whole candidate; avoid regexp if possible * hunspell: simplify AffixCondition, add more tests * add a license to the test
This commit is contained in:
parent
e1ff4c1354
commit
3a99e2aa82
|
@ -0,0 +1,181 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
|
||||||
|
|
||||||
|
import java.util.regex.PatternSyntaxException;
|
||||||
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks the "condition" part of affix definition, as in
|
||||||
|
*
|
||||||
|
* <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
|
||||||
|
*/
|
||||||
|
interface AffixCondition {
|
||||||
|
String ALWAYS_TRUE_KEY = ".*";
|
||||||
|
AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
|
||||||
|
AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
|
||||||
|
|
||||||
|
default boolean acceptsStem(String stem) {
|
||||||
|
return acceptsStem(stem.toCharArray(), 0, stem.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether the given word matches this condition as a stem with both "strip" and "affix"
|
||||||
|
* removed
|
||||||
|
*/
|
||||||
|
boolean acceptsStem(char[] word, int offset, int length);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
|
||||||
|
* that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
|
||||||
|
*/
|
||||||
|
static String uniqueKey(AffixKind kind, String strip, String condition) {
|
||||||
|
if (".".equals(condition)
|
||||||
|
|| kind == PREFIX && strip.startsWith(condition)
|
||||||
|
|| kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) {
|
||||||
|
return ALWAYS_TRUE_KEY;
|
||||||
|
}
|
||||||
|
return condition + " " + kind + " " + strip;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzes the given affix kind, strip and condition and returns an object able to efficiently
|
||||||
|
* check that condition.
|
||||||
|
*/
|
||||||
|
static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
|
||||||
|
if (!isRegexp(condition)) {
|
||||||
|
if (kind == SUFFIX && condition.endsWith(strip)) {
|
||||||
|
return substringCondition(
|
||||||
|
kind, condition.substring(0, condition.length() - strip.length()));
|
||||||
|
}
|
||||||
|
if (kind == PREFIX && condition.startsWith(strip)) {
|
||||||
|
return substringCondition(kind, condition.substring(strip.length()));
|
||||||
|
}
|
||||||
|
return ALWAYS_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
int lastBracket = condition.lastIndexOf('[');
|
||||||
|
if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
|
||||||
|
// unclosed [ is tolerated by Hunspell and occurs in some dictionaries
|
||||||
|
condition = condition + "]";
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
int conditionChars = countCharPatterns(condition);
|
||||||
|
if (conditionChars <= strip.length()) {
|
||||||
|
String regex = kind == PREFIX ? ".*" + condition : condition + ".*";
|
||||||
|
return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (kind == PREFIX) {
|
||||||
|
int split = skipCharPatterns(condition, strip.length());
|
||||||
|
if (!strip.matches(condition.substring(0, split))) {
|
||||||
|
return ALWAYS_FALSE;
|
||||||
|
}
|
||||||
|
return regexpCondition(kind, condition.substring(split), conditionChars - strip.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
int split = skipCharPatterns(condition, conditionChars - strip.length());
|
||||||
|
if (!strip.matches(condition.substring(split))) {
|
||||||
|
return ALWAYS_FALSE;
|
||||||
|
}
|
||||||
|
return regexpCondition(kind, condition.substring(0, split), conditionChars - strip.length());
|
||||||
|
} catch (PatternSyntaxException e) {
|
||||||
|
return ALWAYS_FALSE;
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new IllegalArgumentException("On line: " + line, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int skipCharPatterns(String condition, int count) {
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < count; i++) pos = skipCharPattern(condition, pos);
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int countCharPatterns(String condition) {
|
||||||
|
int conditionChars = 0;
|
||||||
|
for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) conditionChars++;
|
||||||
|
return conditionChars;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int skipCharPattern(String condition, int pos) {
|
||||||
|
if (condition.charAt(pos) == '[') {
|
||||||
|
pos = condition.indexOf(']', pos + 1);
|
||||||
|
if (pos < 0) {
|
||||||
|
throw new AssertionError("Malformed condition " + condition);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pos + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isRegexp(String condition) {
|
||||||
|
return condition.contains("[") || condition.contains(".") || condition.contains("-");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
|
||||||
|
boolean forSuffix = kind == AffixKind.SUFFIX;
|
||||||
|
int condLength = stemCondition.length();
|
||||||
|
return (word, offset, length) -> {
|
||||||
|
if (length < condLength) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int matchStart = forSuffix ? offset + length - condLength : offset;
|
||||||
|
for (int i = 0; i < condLength; i++) {
|
||||||
|
if (stemCondition.charAt(i) != word[matchStart + i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
|
||||||
|
boolean forSuffix = kind == AffixKind.SUFFIX;
|
||||||
|
CharacterRunAutomaton automaton =
|
||||||
|
new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
|
||||||
|
return (word, offset, length) ->
|
||||||
|
length >= charCount
|
||||||
|
&& automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
// "dash hasn't got special meaning" (we must escape it)
|
||||||
|
private static String escapeDash(String re) {
|
||||||
|
if (!re.contains("-")) return re;
|
||||||
|
|
||||||
|
// we have to be careful, even though dash doesn't have a special meaning,
|
||||||
|
// some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
|
||||||
|
StringBuilder escaped = new StringBuilder();
|
||||||
|
for (int i = 0; i < re.length(); i++) {
|
||||||
|
char c = re.charAt(i);
|
||||||
|
if (c == '-') {
|
||||||
|
escaped.append("\\-");
|
||||||
|
} else {
|
||||||
|
escaped.append(c);
|
||||||
|
if (c == '\\' && i + 1 < re.length()) {
|
||||||
|
escaped.append(re.charAt(i + 1));
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return escaped.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
/** The two kinds of affix rules: those declared with PFX and those declared with SFX. */
enum AffixKind { PREFIX, SUFFIX }
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixKind.*;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.OfflineSorter;
|
import org.apache.lucene.util.OfflineSorter;
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
|
@ -89,7 +89,7 @@ public class Dictionary {
|
||||||
* All condition checks used by prefixes and suffixes. these are typically re-used across many
|
* All condition checks used by prefixes and suffixes. these are typically re-used across many
|
||||||
* affix stripping rules. so these are deduplicated, to save RAM.
|
* affix stripping rules. so these are deduplicated, to save RAM.
|
||||||
*/
|
*/
|
||||||
ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
|
ArrayList<AffixCondition> patterns = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
|
* The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
|
||||||
|
@ -338,7 +338,7 @@ public class Dictionary {
|
||||||
Map<String, Integer> seenPatterns = new HashMap<>();
|
Map<String, Integer> seenPatterns = new HashMap<>();
|
||||||
|
|
||||||
// zero condition -> 0 ord
|
// zero condition -> 0 ord
|
||||||
seenPatterns.put(".*", 0);
|
seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
|
||||||
patterns.add(null);
|
patterns.add(null);
|
||||||
|
|
||||||
// zero strip -> 0 ord
|
// zero strip -> 0 ord
|
||||||
|
@ -362,9 +362,11 @@ public class Dictionary {
|
||||||
} else if ("AM".equals(firstWord)) {
|
} else if ("AM".equals(firstWord)) {
|
||||||
parseMorphAlias(line);
|
parseMorphAlias(line);
|
||||||
} else if ("PFX".equals(firstWord)) {
|
} else if ("PFX".equals(firstWord)) {
|
||||||
parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags);
|
parseAffix(
|
||||||
|
prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
|
||||||
} else if ("SFX".equals(firstWord)) {
|
} else if ("SFX".equals(firstWord)) {
|
||||||
parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags);
|
parseAffix(
|
||||||
|
suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
|
||||||
} else if (line.equals("COMPLEXPREFIXES")) {
|
} else if (line.equals("COMPLEXPREFIXES")) {
|
||||||
complexPrefixes =
|
complexPrefixes =
|
||||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||||
|
@ -655,25 +657,6 @@ public class Dictionary {
|
||||||
return fstCompiler.compile();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
static String escapeDash(String re) {
|
|
||||||
// we have to be careful, even though dash doesn't have a special meaning,
|
|
||||||
// some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
|
|
||||||
StringBuilder escaped = new StringBuilder();
|
|
||||||
for (int i = 0; i < re.length(); i++) {
|
|
||||||
char c = re.charAt(i);
|
|
||||||
if (c == '-') {
|
|
||||||
escaped.append("\\-");
|
|
||||||
} else {
|
|
||||||
escaped.append(c);
|
|
||||||
if (c == '\\' && i + 1 < re.length()) {
|
|
||||||
escaped.append(re.charAt(i + 1));
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return escaped.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses a specific affix rule putting the result into the provided affix map
|
* Parses a specific affix rule putting the result into the provided affix map
|
||||||
*
|
*
|
||||||
|
@ -688,7 +671,7 @@ public class Dictionary {
|
||||||
Set<Character> secondStageFlags,
|
Set<Character> secondStageFlags,
|
||||||
String header,
|
String header,
|
||||||
LineNumberReader reader,
|
LineNumberReader reader,
|
||||||
boolean isSuffix,
|
AffixKind kind,
|
||||||
Map<String, Integer> seenPatterns,
|
Map<String, Integer> seenPatterns,
|
||||||
Map<String, Integer> seenStrips,
|
Map<String, Integer> seenStrips,
|
||||||
FlagEnumerator flags)
|
FlagEnumerator flags)
|
||||||
|
@ -738,41 +721,18 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
|
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
|
||||||
// at least the gascon affix file has this issue
|
String key = AffixCondition.uniqueKey(kind, strip, condition);
|
||||||
if (condition.startsWith("[") && condition.indexOf(']') == -1) {
|
|
||||||
condition = condition + "]";
|
|
||||||
}
|
|
||||||
// "dash hasn't got special meaning" (we must escape it)
|
|
||||||
if (condition.indexOf('-') >= 0) {
|
|
||||||
condition = escapeDash(condition);
|
|
||||||
}
|
|
||||||
|
|
||||||
final String regex;
|
|
||||||
if (".".equals(condition)) {
|
|
||||||
regex = ".*"; // Zero condition is indicated by dot
|
|
||||||
} else if (condition.equals(strip)) {
|
|
||||||
regex = ".*"; // TODO: optimize this better:
|
|
||||||
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
|
|
||||||
// but this is complicated...
|
|
||||||
} else {
|
|
||||||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
|
||||||
regex = isSuffix ? ".*" + condition : condition + ".*";
|
|
||||||
}
|
|
||||||
|
|
||||||
// deduplicate patterns
|
// deduplicate patterns
|
||||||
Integer patternIndex = seenPatterns.get(regex);
|
Integer patternIndex = seenPatterns.get(key);
|
||||||
if (patternIndex == null) {
|
if (patternIndex == null) {
|
||||||
patternIndex = patterns.size();
|
patternIndex = patterns.size();
|
||||||
if (patternIndex > Short.MAX_VALUE) {
|
if (patternIndex > Short.MAX_VALUE) {
|
||||||
throw new UnsupportedOperationException(
|
throw new UnsupportedOperationException(
|
||||||
"Too many patterns, please report this to dev@lucene.apache.org");
|
"Too many patterns, please report this to dev@lucene.apache.org");
|
||||||
}
|
}
|
||||||
seenPatterns.put(regex, patternIndex);
|
seenPatterns.put(key, patternIndex);
|
||||||
try {
|
patterns.add(AffixCondition.compile(kind, strip, condition, line));
|
||||||
patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton()));
|
|
||||||
} catch (IllegalArgumentException e) {
|
|
||||||
throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Integer stripOrd = seenStrips.get(strip);
|
Integer stripOrd = seenStrips.get(strip);
|
||||||
|
@ -811,7 +771,7 @@ public class Dictionary {
|
||||||
affixArg = cleanInput(affixArg, sb).toString();
|
affixArg = cleanInput(affixArg, sb).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isSuffix) {
|
if (kind == SUFFIX) {
|
||||||
affixArg = new StringBuilder(affixArg).reverse().toString();
|
affixArg = new StringBuilder(affixArg).reverse().toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -820,17 +780,6 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static RegExp conditionRegexp(String regex) {
|
|
||||||
try {
|
|
||||||
return new RegExp(regex, RegExp.NONE);
|
|
||||||
} catch (IllegalArgumentException e) {
|
|
||||||
if (e.getMessage().contains("expected ']'")) {
|
|
||||||
return conditionRegexp(regex + "]");
|
|
||||||
}
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
char affixData(int affixIndex, int offset) {
|
char affixData(int affixIndex, int offset) {
|
||||||
return affixData[affixIndex * 4 + offset];
|
return affixData[affixIndex * 4 + offset];
|
||||||
}
|
}
|
||||||
|
|
|
@ -269,7 +269,7 @@ class GeneratingSuggester {
|
||||||
|
|
||||||
private boolean checkAffixCondition(int suffixId, String stem) {
|
private boolean checkAffixCondition(int suffixId, String stem) {
|
||||||
int condition = dictionary.getAffixCondition(suffixId);
|
int condition = dictionary.getAffixCondition(suffixId);
|
||||||
return condition == 0 || dictionary.patterns.get(condition).run(stem);
|
return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int affixStripLength(int affixId) {
|
private int affixStripLength(int affixId) {
|
||||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -486,14 +485,13 @@ final class Stemmer {
|
||||||
int stripLen = stripEnd - stripStart;
|
int stripLen = stripEnd - stripStart;
|
||||||
|
|
||||||
char[] stripData = dictionary.stripData;
|
char[] stripData = dictionary.stripData;
|
||||||
boolean condition =
|
int condition = dictionary.getAffixCondition(affix);
|
||||||
isPrefix
|
if (condition != 0) {
|
||||||
? checkCondition(
|
int deAffixedOffset = isPrefix ? offset + affixLen : offset;
|
||||||
affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
|
if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
|
||||||
: checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
|
|
||||||
if (!condition) {
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (stripLen == 0) return word;
|
if (stripLen == 0) return word;
|
||||||
|
|
||||||
|
@ -547,33 +545,6 @@ final class Stemmer {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** checks condition of the concatenation of two strings */
|
|
||||||
// note: this is pretty stupid, we really should subtract strip from the condition up front and
|
|
||||||
// just check the stem
|
|
||||||
// but this is a little bit more complicated.
|
|
||||||
private boolean checkCondition(
|
|
||||||
int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
|
|
||||||
int condition = dictionary.getAffixCondition(affix);
|
|
||||||
if (condition != 0) {
|
|
||||||
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
|
|
||||||
int state = 0;
|
|
||||||
for (int i = c1off; i < c1off + c1len; i++) {
|
|
||||||
state = pattern.step(state, c1[i]);
|
|
||||||
if (state == -1) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (int i = c2off; i < c2off + c2len; i++) {
|
|
||||||
state = pattern.step(state, c2[i]);
|
|
||||||
if (state == -1) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return pattern.isAccept(state);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Applies the affix rule to the given word, producing a list of stems if any are found
|
* Applies the affix rule to the given word, producing a list of stems if any are found
|
||||||
*
|
*
|
||||||
|
|
|
@ -0,0 +1,76 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestAffixCondition extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testPlainSuffixMatching() {
|
||||||
|
AffixCondition condition = AffixCondition.compile(SUFFIX, "b", "ab", "");
|
||||||
|
assertTrue(condition.acceptsStem("a"));
|
||||||
|
assertFalse(condition.acceptsStem("b"));
|
||||||
|
assertFalse(condition.acceptsStem("ab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPlainPrefixMatching() {
|
||||||
|
AffixCondition condition = AffixCondition.compile(PREFIX, "a", "ab", "");
|
||||||
|
assertFalse(condition.acceptsStem("ab"));
|
||||||
|
assertTrue(condition.acceptsStem("b"));
|
||||||
|
assertFalse(condition.acceptsStem("a"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDotMatching() {
|
||||||
|
AffixCondition condition = AffixCondition.compile(PREFIX, "", "wr.", "");
|
||||||
|
assertTrue(condition.acceptsStem("wry"));
|
||||||
|
assertTrue(condition.acceptsStem("wrong"));
|
||||||
|
assertFalse(condition.acceptsStem("white"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUniqueKey() {
|
||||||
|
assertNotEquals(
|
||||||
|
AffixCondition.uniqueKey(PREFIX, "", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x"));
|
||||||
|
assertNotEquals(
|
||||||
|
AffixCondition.uniqueKey(SUFFIX, "y", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x"));
|
||||||
|
assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "", "."));
|
||||||
|
assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "abc"));
|
||||||
|
|
||||||
|
assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "bc"));
|
||||||
|
assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "abc", "ab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConditionHasBracketsIntersectingWithStrip() {
|
||||||
|
assertTrue(AffixCondition.compile(SUFFIX, "oj", "[io]j", "").acceptsStem("whatever"));
|
||||||
|
assertTrue(AffixCondition.compile(SUFFIX, "oj", "o[ioj", "").acceptsStem("whatever"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testImpossibleCondition() {
|
||||||
|
assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "a", "b", ""));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNonHunspellPatternCharacters() {
|
||||||
|
assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)", ""));
|
||||||
|
assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^.x)", ""));
|
||||||
|
assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "[z](^ax)", ""));
|
||||||
|
assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)[z]", ""));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue