diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
new file mode 100644
index 00000000000..be0d8ba2865
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
+
+import java.util.regex.PatternSyntaxException;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Checks the "condition" part of affix definition, as in
+ *
+ * <pre>
+ * PFX flag stripping prefix [condition [morphological_fields...]]
+ * </pre>
+ */
+interface AffixCondition {
+  /** Deduplication key returned by {@link #uniqueKey} for conditions that need no check. */
+  String ALWAYS_TRUE_KEY = ".*";
+
+  /** A condition that accepts every stem. */
+  AffixCondition ALWAYS_TRUE = (chars, start, len) -> true;
+
+  /** A condition that rejects every stem. */
+  AffixCondition ALWAYS_FALSE = (chars, start, len) -> false;
+
+  /**
+   * Convenience overload that checks a whole string as the stem.
+   *
+   * @param stem the stem to check
+   * @return the result of checking all characters of {@code stem} with the array-based overload
+   */
+  default boolean acceptsStem(String stem) {
+    char[] chars = stem.toCharArray();
+    return acceptsStem(chars, 0, chars.length);
+  }
+
+ /**
+ * Checks a stem given as a slice of a character array.
+ *
+ * @param word the array holding the stem characters
+ * @param offset the index in {@code word} at which the stem starts
+ * @param length the number of characters of the stem
+ * @return whether the given word matches this condition as a stem with both "strip" and "affix"
+ * removed
+ */
+ boolean acceptsStem(char[] word, int offset, int length);
+
+  /**
+   * Builds the key used to deduplicate same condition+strip+kind triples.
+   *
+   * @param kind whether the rule is a prefix or a suffix rule
+   * @param strip the characters the rule strips
+   * @param condition the condition text of the rule
+   * @return {@link #ALWAYS_TRUE_KEY} for trivial conditions that need no check, otherwise the
+   *     condition, kind and strip joined by single spaces
+   */
+  static String uniqueKey(AffixKind kind, String strip, String condition) {
+    // a lone dot is the "no condition" marker
+    if (".".equals(condition)) {
+      return ALWAYS_TRUE_KEY;
+    }
+    // the strip itself already satisfies the condition
+    if (kind == PREFIX && strip.startsWith(condition)) {
+      return ALWAYS_TRUE_KEY;
+    }
+    // NOTE(review): the isRegexp exclusion applies to suffixes only (operator precedence in the
+    // original expression) - confirm whether prefixes were meant to be excluded as well
+    if (kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) {
+      return ALWAYS_TRUE_KEY;
+    }
+    return condition + " " + kind + " " + strip;
+  }
+
+  /**
+   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
+   * check that condition against a stem from which "strip" has already been removed.
+   *
+   * <p>The part of the condition that overlaps with {@code strip} is verified once, here; only the
+   * remaining part is compiled into the returned checker.
+   *
+   * @param kind whether the rule is a prefix or a suffix rule
+   * @param strip the characters the rule strips (may be empty)
+   * @param condition the condition text of the rule
+   * @param line the affix file line, used only in the exception message
+   * @return the compiled condition; {@link #ALWAYS_FALSE} if it can never be satisfied or is not a
+   *     valid pattern, {@link #ALWAYS_TRUE} if the strip alone already satisfies it
+   * @throws IllegalArgumentException if compilation fails for any reason other than a {@link
+   *     PatternSyntaxException}
+   */
+  static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
+    if (!isRegexp(condition)) {
+      // plain text: the condition has to contain the strip on the affix side,
+      // the rest is compared literally against the stem
+      if (kind == SUFFIX && condition.endsWith(strip)) {
+        return substringCondition(
+            kind, condition.substring(0, condition.length() - strip.length()));
+      }
+      if (kind == PREFIX && condition.startsWith(strip)) {
+        return substringCondition(kind, condition.substring(strip.length()));
+      }
+      return ALWAYS_FALSE;
+    }
+
+    int lastBracket = condition.lastIndexOf('[');
+    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
+      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
+      condition = condition + "]";
+    }
+
+    try {
+      int conditionChars = countCharPatterns(condition);
+      if (conditionChars <= strip.length()) {
+        // The whole condition lies inside the strip. A prefix condition is anchored at the start
+        // of the stem (so at the start of the strip), a suffix condition at its end.
+        // Dashes are escaped because they have no special meaning in Hunspell conditions.
+        String escaped = escapeDash(condition);
+        String regex = kind == PREFIX ? escaped + ".*" : ".*" + escaped;
+        return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
+      }
+
+      // number of condition character patterns that apply to the stem without the strip
+      int stemChars = conditionChars - strip.length();
+
+      if (kind == PREFIX) {
+        // the leading patterns cover the strip, the rest is checked against the stem
+        int split = skipCharPatterns(condition, strip.length());
+        if (!strip.matches(escapeDash(condition.substring(0, split)))) {
+          return ALWAYS_FALSE;
+        }
+        return regexpCondition(kind, condition.substring(split), stemChars);
+      }
+
+      // suffix: the trailing patterns cover the strip, the leading ones are checked against the stem
+      int split = skipCharPatterns(condition, stemChars);
+      if (!strip.matches(escapeDash(condition.substring(split)))) {
+        return ALWAYS_FALSE;
+      }
+      return regexpCondition(kind, condition.substring(0, split), stemChars);
+    } catch (PatternSyntaxException e) {
+      // characters that are not valid in a pattern: such a condition can never match
+      return ALWAYS_FALSE;
+    } catch (Throwable e) {
+      throw new IllegalArgumentException("On line: " + line, e);
+    }
+  }
+
+  /**
+   * Skips {@code count} character patterns from the start of the condition.
+   *
+   * @return the index in {@code condition} right after the skipped patterns
+   */
+  private static int skipCharPatterns(String condition, int count) {
+    int position = 0;
+    int remaining = count;
+    while (remaining-- > 0) {
+      position = skipCharPattern(condition, position);
+    }
+    return position;
+  }
+
+  /**
+   * Counts the character patterns in the condition, where a bracketed group counts as one
+   * pattern and any other character counts as one pattern.
+   */
+  private static int countCharPatterns(String condition) {
+    int patterns = 0;
+    int position = 0;
+    while (position < condition.length()) {
+      patterns++;
+      position = skipCharPattern(condition, position);
+    }
+    return patterns;
+  }
+
+  /**
+   * Skips the single character pattern starting at {@code pos}: either one character or a whole
+   * {@code [...]} group.
+   *
+   * @return the index right after that pattern
+   * @throws AssertionError if a {@code [} at {@code pos} has no closing {@code ]} after it
+   */
+  private static int skipCharPattern(String condition, int pos) {
+    if (condition.charAt(pos) != '[') {
+      return pos + 1;
+    }
+    int closing = condition.indexOf(']', pos + 1);
+    if (closing < 0) {
+      throw new AssertionError("Malformed condition " + condition);
+    }
+    return closing + 1;
+  }
+
+  /** Tells whether the condition contains any of the characters {@code [}, {@code .} or {@code -}. */
+  private static boolean isRegexp(String condition) {
+    for (int i = 0; i < condition.length(); i++) {
+      char c = condition.charAt(i);
+      if (c == '[' || c == '.' || c == '-') {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Creates a condition that compares the stem literally with the given text: at the end of the
+   * stem for suffixes, at its start otherwise.
+   */
+  private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
+    char[] expected = stemCondition.toCharArray();
+    boolean atEnd = kind == AffixKind.SUFFIX;
+    return (word, offset, length) -> {
+      // a stem shorter than the expected text can never match
+      if (length < expected.length) {
+        return false;
+      }
+      int from = atEnd ? offset + length - expected.length : offset;
+      int matched = 0;
+      while (matched < expected.length && expected[matched] == word[from + matched]) {
+        matched++;
+      }
+      return matched == expected.length;
+    };
+  }
+
+  /**
+   * Creates a condition that runs an automaton built from the dash-escaped condition over {@code
+   * charCount} characters of the stem: the last ones for suffixes, the first ones otherwise.
+   */
+  private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
+    RegExp parsed = new RegExp(escapeDash(condition), RegExp.NONE);
+    CharacterRunAutomaton matcher = new CharacterRunAutomaton(parsed.toAutomaton());
+    if (kind == AffixKind.SUFFIX) {
+      return (word, offset, length) ->
+          length >= charCount && matcher.run(word, offset + length - charCount, charCount);
+    }
+    return (word, offset, length) -> length >= charCount && matcher.run(word, offset, charCount);
+  }
+
+  /**
+   * Escapes every dash that is not already escaped, because a dash has no special meaning in a
+   * condition. A backslash and the character following it are copied unchanged, so dashes that a
+   * dictionary has already escaped (e.g. pt_PT) are not escaped twice.
+   *
+   * @return {@code re} itself if it contains no dash, otherwise the escaped copy
+   */
+  private static String escapeDash(String re) {
+    if (re.indexOf('-') < 0) {
+      return re;
+    }
+
+    StringBuilder out = new StringBuilder();
+    int i = 0;
+    while (i < re.length()) {
+      char c = re.charAt(i++);
+      if (c == '-') {
+        out.append("\\-");
+        continue;
+      }
+      out.append(c);
+      // keep an existing escape sequence together with its escaped character
+      if (c == '\\' && i < re.length()) {
+        out.append(re.charAt(i++));
+      }
+    }
+    return out.toString();
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
new file mode 100644
index 00000000000..bd41673b5ab
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+/** The kind of an affix rule, used to tell prefix rules from suffix rules. */
+enum AffixKind {
+ /** Used for rules read from "PFX" lines of the affix file. */
+ PREFIX,
+ /** Used for rules read from "SFX" lines of the affix file. */
+ SUFFIX
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 0547c3e4ebc..bea4cf97dc4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.hunspell;
+import static org.apache.lucene.analysis.hunspell.AffixKind.*;
+
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
@@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -89,7 +89,7 @@ public class Dictionary {
* All condition checks used by prefixes and suffixes. these are typically re-used across many
* affix stripping rules. so these are deduplicated, to save RAM.
*/
- ArrayList patterns = new ArrayList<>();
+ ArrayList patterns = new ArrayList<>();
/**
* The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
@@ -338,7 +338,7 @@ public class Dictionary {
Map seenPatterns = new HashMap<>();
// zero condition -> 0 ord
- seenPatterns.put(".*", 0);
+ seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
patterns.add(null);
// zero strip -> 0 ord
@@ -362,9 +362,11 @@ public class Dictionary {
} else if ("AM".equals(firstWord)) {
parseMorphAlias(line);
} else if ("PFX".equals(firstWord)) {
- parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags);
+ parseAffix(
+ prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
} else if ("SFX".equals(firstWord)) {
- parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags);
+ parseAffix(
+ suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
} else if (line.equals("COMPLEXPREFIXES")) {
complexPrefixes =
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -655,25 +657,6 @@ public class Dictionary {
return fstCompiler.compile();
}
- static String escapeDash(String re) {
- // we have to be careful, even though dash doesn't have a special meaning,
- // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
- StringBuilder escaped = new StringBuilder();
- for (int i = 0; i < re.length(); i++) {
- char c = re.charAt(i);
- if (c == '-') {
- escaped.append("\\-");
- } else {
- escaped.append(c);
- if (c == '\\' && i + 1 < re.length()) {
- escaped.append(re.charAt(i + 1));
- i++;
- }
- }
- }
- return escaped.toString();
- }
-
/**
* Parses a specific affix rule putting the result into the provided affix map
*
@@ -688,7 +671,7 @@ public class Dictionary {
Set secondStageFlags,
String header,
LineNumberReader reader,
- boolean isSuffix,
+ AffixKind kind,
Map seenPatterns,
Map seenStrips,
FlagEnumerator flags)
@@ -738,41 +721,18 @@ public class Dictionary {
}
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
- // at least the gascon affix file has this issue
- if (condition.startsWith("[") && condition.indexOf(']') == -1) {
- condition = condition + "]";
- }
- // "dash hasn't got special meaning" (we must escape it)
- if (condition.indexOf('-') >= 0) {
- condition = escapeDash(condition);
- }
-
- final String regex;
- if (".".equals(condition)) {
- regex = ".*"; // Zero condition is indicated by dot
- } else if (condition.equals(strip)) {
- regex = ".*"; // TODO: optimize this better:
- // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
- // but this is complicated...
- } else {
- // TODO: really for suffixes we should reverse the automaton and run them backwards
- regex = isSuffix ? ".*" + condition : condition + ".*";
- }
+ String key = AffixCondition.uniqueKey(kind, strip, condition);
// deduplicate patterns
- Integer patternIndex = seenPatterns.get(regex);
+ Integer patternIndex = seenPatterns.get(key);
if (patternIndex == null) {
patternIndex = patterns.size();
if (patternIndex > Short.MAX_VALUE) {
throw new UnsupportedOperationException(
"Too many patterns, please report this to dev@lucene.apache.org");
}
- seenPatterns.put(regex, patternIndex);
- try {
- patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton()));
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e);
- }
+ seenPatterns.put(key, patternIndex);
+ patterns.add(AffixCondition.compile(kind, strip, condition, line));
}
Integer stripOrd = seenStrips.get(strip);
@@ -811,7 +771,7 @@ public class Dictionary {
affixArg = cleanInput(affixArg, sb).toString();
}
- if (isSuffix) {
+ if (kind == SUFFIX) {
affixArg = new StringBuilder(affixArg).reverse().toString();
}
@@ -820,17 +780,6 @@ public class Dictionary {
}
}
- private static RegExp conditionRegexp(String regex) {
- try {
- return new RegExp(regex, RegExp.NONE);
- } catch (IllegalArgumentException e) {
- if (e.getMessage().contains("expected ']'")) {
- return conditionRegexp(regex + "]");
- }
- throw e;
- }
- }
-
char affixData(int affixIndex, int offset) {
return affixData[affixIndex * 4 + offset];
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index cb5286132fc..90304594ad6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -269,7 +269,7 @@ class GeneratingSuggester {
private boolean checkAffixCondition(int suffixId, String stem) {
int condition = dictionary.getAffixCondition(suffixId);
- return condition == 0 || dictionary.patterns.get(condition).run(stem);
+ return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem);
}
private int affixStripLength(int affixId) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 453211bc213..f864dee2db3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.fst.FST;
/**
@@ -486,13 +485,12 @@ final class Stemmer {
int stripLen = stripEnd - stripStart;
char[] stripData = dictionary.stripData;
- boolean condition =
- isPrefix
- ? checkCondition(
- affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
- : checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
- if (!condition) {
- return null;
+ int condition = dictionary.getAffixCondition(affix);
+ if (condition != 0) {
+ int deAffixedOffset = isPrefix ? offset + affixLen : offset;
+ if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
+ return null;
+ }
}
if (stripLen == 0) return word;
@@ -547,33 +545,6 @@ final class Stemmer {
return false;
}
- /** checks condition of the concatenation of two strings */
- // note: this is pretty stupid, we really should subtract strip from the condition up front and
- // just check the stem
- // but this is a little bit more complicated.
- private boolean checkCondition(
- int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
- int condition = dictionary.getAffixCondition(affix);
- if (condition != 0) {
- CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
- int state = 0;
- for (int i = c1off; i < c1off + c1len; i++) {
- state = pattern.step(state, c1[i]);
- if (state == -1) {
- return false;
- }
- }
- for (int i = c2off; i < c2off + c2len; i++) {
- state = pattern.step(state, c2[i]);
- if (state == -1) {
- return false;
- }
- }
- return pattern.isAccept(state);
- }
- return true;
- }
-
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
new file mode 100644
index 00000000000..c419040f92e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE;
+import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY;
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestAffixCondition extends LuceneTestCase {
+
+  /** Plain-text suffix condition "ab" with strip "b": only the "a" part is left for the stem. */
+  public void testPlainSuffixMatching() {
+    AffixCondition suffixCondition = AffixCondition.compile(SUFFIX, "b", "ab", "");
+    assertTrue(suffixCondition.acceptsStem("a"));
+    for (String rejected : new String[] {"b", "ab"}) {
+      assertFalse(suffixCondition.acceptsStem(rejected));
+    }
+  }
+
+  /** Plain-text prefix condition "ab" with strip "a": only the "b" part is left for the stem. */
+  public void testPlainPrefixMatching() {
+    AffixCondition prefixCondition = AffixCondition.compile(PREFIX, "a", "ab", "");
+
+    boolean acceptsFullCondition = prefixCondition.acceptsStem("ab");
+    assertFalse(acceptsFullCondition);
+
+    boolean acceptsRemainder = prefixCondition.acceptsStem("b");
+    assertTrue(acceptsRemainder);
+
+    boolean acceptsStrip = prefixCondition.acceptsStem("a");
+    assertFalse(acceptsStrip);
+  }
+
+  /** A dot in the condition stands for any single character. */
+  public void testDotMatching() {
+    AffixCondition anyAfterWr = AffixCondition.compile(PREFIX, "", "wr.", "");
+    for (String accepted : new String[] {"wry", "wrong"}) {
+      assertTrue(anyAfterWr.acceptsStem(accepted));
+    }
+    assertFalse(anyAfterWr.acceptsStem("white"));
+  }
+
+  /** Keys differ by kind and by strip; trivial conditions share the always-true key. */
+  public void testUniqueKey() {
+    String prefixKey = AffixCondition.uniqueKey(PREFIX, "", "x");
+    String suffixKey = AffixCondition.uniqueKey(SUFFIX, "", "x");
+    assertNotEquals(prefixKey, suffixKey);
+
+    String suffixWithStripKey = AffixCondition.uniqueKey(SUFFIX, "y", "x");
+    assertNotEquals(suffixWithStripKey, suffixKey);
+
+    // no condition at all
+    String dotKey = AffixCondition.uniqueKey(PREFIX, "", ".");
+    assertEquals(ALWAYS_TRUE_KEY, dotKey);
+
+    // condition equal to the strip
+    String sameAsStripKey = AffixCondition.uniqueKey(SUFFIX, "abc", "abc");
+    assertEquals(ALWAYS_TRUE_KEY, sameAsStripKey);
+
+    // condition fully contained in the strip, on the affix side
+    String stripEndKey = AffixCondition.uniqueKey(SUFFIX, "abc", "bc");
+    assertEquals(ALWAYS_TRUE_KEY, stripEndKey);
+    String stripStartKey = AffixCondition.uniqueKey(PREFIX, "abc", "ab");
+    assertEquals(ALWAYS_TRUE_KEY, stripStartKey);
+  }
+
+  /** Bracket groups (closed or left unclosed) that overlap with the strip accept any stem. */
+  public void testConditionHasBracketsIntersectingWithStrip() {
+    for (String bracketed : new String[] {"[io]j", "o[ioj"}) {
+      AffixCondition compiled = AffixCondition.compile(SUFFIX, "oj", bracketed, "");
+      assertTrue(compiled.acceptsStem("whatever"));
+    }
+  }
+
+  /** A plain condition that does not end with the strip compiles to the always-false constant. */
+  public void testImpossibleCondition() {
+    AffixCondition compiled = AffixCondition.compile(SUFFIX, "a", "b", "");
+    assertEquals(ALWAYS_FALSE, compiled);
+  }
+
+  /** Conditions with characters such as parentheses and carets compile to always-false. */
+  public void testNonHunspellPatternCharacters() {
+    String[] conditions = {"(^ax)", "(^.x)", "[z](^ax)", "(^ax)[z]"};
+    for (String unsupported : conditions) {
+      AffixCondition compiled = AffixCondition.compile(SUFFIX, "x", unsupported, "");
+      assertEquals(ALWAYS_FALSE, compiled);
+    }
+  }
+}