diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
new file mode 100644
index 00000000000..be0d8ba2865
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
+
+import java.util.regex.PatternSyntaxException;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Checks the "condition" part of affix definition, as in
+ *
+ * <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
+ */
+interface AffixCondition {
+  /** Deduplication key shared by all conditions that need no run-time check. */
+  String ALWAYS_TRUE_KEY = ".*";
+
+  /** A condition accepting every stem. */
+  AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
+
+  /** A condition rejecting every stem. */
+  AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
+
+  /** Convenience overload that checks a whole string as the stem. */
+  default boolean acceptsStem(String stem) {
+    return acceptsStem(stem.toCharArray(), 0, stem.length());
+  }
+
+  /**
+   * @return whether the given word matches this condition as a stem with both "strip" and "affix"
+   *     removed
+   */
+  boolean acceptsStem(char[] word, int offset, int length);
+
+  /**
+   * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
+   *     that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
+   */
+  static String uniqueKey(AffixKind kind, String strip, String condition) {
+    if (".".equals(condition)
+        || kind == PREFIX && strip.startsWith(condition)
+        || kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) {
+      return ALWAYS_TRUE_KEY;
+    }
+    return condition + " " + kind + " " + strip;
+  }
+
+  /**
+   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
+   * check that condition.
+   *
+   * <p>The part of the condition that overlaps {@code strip} is matched against {@code strip}
+   * here, once; the returned object checks only the remaining part against each stem.
+   *
+   * @param kind whether this is a prefix or a suffix rule
+   * @param strip the "stripping" field of the affix rule
+   * @param condition the "condition" field of the affix rule
+   * @param line the affix file line, used only in the error message
+   * @return the compiled condition; {@link #ALWAYS_TRUE} or {@link #ALWAYS_FALSE} when {@code
+   *     strip} alone already decides the outcome
+   * @throws IllegalArgumentException if analysis fails with anything but a {@link
+   *     PatternSyntaxException}, which yields {@link #ALWAYS_FALSE} instead
+   */
+  static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
+    if (!isRegexp(condition)) {
+      if (kind == SUFFIX && condition.endsWith(strip)) {
+        return substringCondition(
+            kind, condition.substring(0, condition.length() - strip.length()));
+      }
+      if (kind == PREFIX && condition.startsWith(strip)) {
+        return substringCondition(kind, condition.substring(strip.length()));
+      }
+      return ALWAYS_FALSE;
+    }
+
+    int lastBracket = condition.lastIndexOf('[');
+    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
+      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
+      condition = condition + "]";
+    }
+
+    try {
+      int conditionChars = countCharPatterns(condition);
+      if (conditionChars <= strip.length()) {
+        // The whole condition lies within strip: a prefix condition is anchored at the start of
+        // strip and a suffix condition at its end, the same anchoring that uniqueKey uses.
+        String regex = kind == PREFIX ? condition + ".*" : ".*" + condition;
+        return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
+      }
+
+      if (kind == PREFIX) {
+        int split = skipCharPatterns(condition, strip.length());
+        if (!strip.matches(condition.substring(0, split))) {
+          return ALWAYS_FALSE;
+        }
+        return regexpCondition(kind, condition.substring(split), conditionChars - strip.length());
+      }
+
+      int split = skipCharPatterns(condition, conditionChars - strip.length());
+      if (!strip.matches(condition.substring(split))) {
+        return ALWAYS_FALSE;
+      }
+      return regexpCondition(kind, condition.substring(0, split), conditionChars - strip.length());
+    } catch (PatternSyntaxException e) {
+      // the condition is not a valid java.util.regex pattern for String.matches
+      return ALWAYS_FALSE;
+    } catch (Throwable e) {
+      // also covers the AssertionError that skipCharPattern throws for a malformed condition
+      throw new IllegalArgumentException("On line: " + line, e);
+    }
+  }
+
+  /** Returns the index reached after skipping {@code count} char patterns from the start. */
+  private static int skipCharPatterns(String condition, int count) {
+    int pos = 0;
+    for (int i = 0; i < count; i++) pos = skipCharPattern(condition, pos);
+    return pos;
+  }
+
+  /** Counts the char patterns (single characters or bracketed groups) in the condition. */
+  private static int countCharPatterns(String condition) {
+    int conditionChars = 0;
+    for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) conditionChars++;
+    return conditionChars;
+  }
+
+  /** Returns the index just past the char pattern (a character or a [...] group) at pos. */
+  private static int skipCharPattern(String condition, int pos) {
+    if (condition.charAt(pos) == '[') {
+      pos = condition.indexOf(']', pos + 1);
+      if (pos < 0) {
+        throw new AssertionError("Malformed condition " + condition);
+      }
+    }
+    return pos + 1;
+  }
+
+  /** Tells whether the condition contains any of the characters '[', '.' or '-'. */
+  private static boolean isRegexp(String condition) {
+    return condition.contains("[") || condition.contains(".") || condition.contains("-");
+  }
+
+  /** Creates a condition comparing the stem's start (prefix) or end (suffix) to a literal. */
+  private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
+    boolean forSuffix = kind == AffixKind.SUFFIX;
+    int condLength = stemCondition.length();
+    return (word, offset, length) -> {
+      if (length < condLength) {
+        return false;
+      }
+      int matchStart = forSuffix ? offset + length - condLength : offset;
+      for (int i = 0; i < condLength; i++) {
+        if (stemCondition.charAt(i) != word[matchStart + i]) {
+          return false;
+        }
+      }
+      return true;
+    };
+  }
+
+  /** Creates a condition that runs an automaton over charCount chars of the stem. */
+  private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
+    boolean forSuffix = kind == AffixKind.SUFFIX;
+    CharacterRunAutomaton automaton =
+        new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
+    return (word, offset, length) ->
+        length >= charCount
+            && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
+  }
+
+  // "dash hasn't got special meaning" (we must escape it)
+  private static String escapeDash(String re) {
+    if (!re.contains("-")) return re;
+
+    // we have to be careful, even though dash doesn't have a special meaning,
+    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
+    StringBuilder escaped = new StringBuilder();
+    for (int i = 0; i < re.length(); i++) {
+      char c = re.charAt(i);
+      if (c == '-') {
+        escaped.append("\\-");
+      } else {
+        escaped.append(c);
+        if (c == '\\' && i + 1 < re.length()) {
+          escaped.append(re.charAt(i + 1));
+          i++;
+        }
+      }
+    }
+    return escaped.toString();
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
new file mode 100644
index 00000000000..bd41673b5ab
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +enum AffixKind { + PREFIX, + SUFFIX +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 0547c3e4ebc..bea4cf97dc4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.analysis.hunspell; +import static org.apache.lucene.analysis.hunspell.AffixKind.*; + import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.ByteArrayInputStream; @@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.IntSequenceOutputs; @@ -89,7 +89,7 @@ public class Dictionary { * All condition checks used by prefixes and suffixes. these are typically re-used across many * affix stripping rules. so these are deduplicated, to save RAM. */ - ArrayList patterns = new ArrayList<>(); + ArrayList patterns = new ArrayList<>(); /** * The entries in the .dic file, mapping to their set of flags. 
the fst output is the ordinal list @@ -338,7 +338,7 @@ public class Dictionary { Map seenPatterns = new HashMap<>(); // zero condition -> 0 ord - seenPatterns.put(".*", 0); + seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0); patterns.add(null); // zero strip -> 0 ord @@ -362,9 +362,11 @@ public class Dictionary { } else if ("AM".equals(firstWord)) { parseMorphAlias(line); } else if ("PFX".equals(firstWord)) { - parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags); + parseAffix( + prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags); } else if ("SFX".equals(firstWord)) { - parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags); + parseAffix( + suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags); } else if (line.equals("COMPLEXPREFIXES")) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix @@ -655,25 +657,6 @@ public class Dictionary { return fstCompiler.compile(); } - static String escapeDash(String re) { - // we have to be careful, even though dash doesn't have a special meaning, - // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it - StringBuilder escaped = new StringBuilder(); - for (int i = 0; i < re.length(); i++) { - char c = re.charAt(i); - if (c == '-') { - escaped.append("\\-"); - } else { - escaped.append(c); - if (c == '\\' && i + 1 < re.length()) { - escaped.append(re.charAt(i + 1)); - i++; - } - } - } - return escaped.toString(); - } - /** * Parses a specific affix rule putting the result into the provided affix map * @@ -688,7 +671,7 @@ public class Dictionary { Set secondStageFlags, String header, LineNumberReader reader, - boolean isSuffix, + AffixKind kind, Map seenPatterns, Map seenStrips, FlagEnumerator flags) @@ -738,41 +721,18 @@ public class Dictionary { } String condition = ruleArgs.length > 4 ? 
ruleArgs[4] : "."; - // at least the gascon affix file has this issue - if (condition.startsWith("[") && condition.indexOf(']') == -1) { - condition = condition + "]"; - } - // "dash hasn't got special meaning" (we must escape it) - if (condition.indexOf('-') >= 0) { - condition = escapeDash(condition); - } - - final String regex; - if (".".equals(condition)) { - regex = ".*"; // Zero condition is indicated by dot - } else if (condition.equals(strip)) { - regex = ".*"; // TODO: optimize this better: - // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! - // but this is complicated... - } else { - // TODO: really for suffixes we should reverse the automaton and run them backwards - regex = isSuffix ? ".*" + condition : condition + ".*"; - } + String key = AffixCondition.uniqueKey(kind, strip, condition); // deduplicate patterns - Integer patternIndex = seenPatterns.get(regex); + Integer patternIndex = seenPatterns.get(key); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } - seenPatterns.put(regex, patternIndex); - try { - patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton())); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e); - } + seenPatterns.put(key, patternIndex); + patterns.add(AffixCondition.compile(kind, strip, condition, line)); } Integer stripOrd = seenStrips.get(strip); @@ -811,7 +771,7 @@ public class Dictionary { affixArg = cleanInput(affixArg, sb).toString(); } - if (isSuffix) { + if (kind == SUFFIX) { affixArg = new StringBuilder(affixArg).reverse().toString(); } @@ -820,17 +780,6 @@ public class Dictionary { } } - private static RegExp conditionRegexp(String regex) { - try { - return new RegExp(regex, RegExp.NONE); - } catch (IllegalArgumentException e) { 
- if (e.getMessage().contains("expected ']'")) { - return conditionRegexp(regex + "]"); - } - throw e; - } - } - char affixData(int affixIndex, int offset) { return affixData[affixIndex * 4 + offset]; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index cb5286132fc..90304594ad6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -269,7 +269,7 @@ class GeneratingSuggester { private boolean checkAffixCondition(int suffixId, String stem) { int condition = dictionary.getAffixCondition(suffixId); - return condition == 0 || dictionary.patterns.get(condition).run(stem); + return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem); } private int affixStripLength(int affixId) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 453211bc213..f864dee2db3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.fst.FST; /** @@ -486,13 +485,12 @@ final class Stemmer { int stripLen = stripEnd - stripStart; char[] stripData = dictionary.stripData; - boolean condition = - isPrefix - ? 
checkCondition( - affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen) - : checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen); - if (!condition) { - return null; + int condition = dictionary.getAffixCondition(affix); + if (condition != 0) { + int deAffixedOffset = isPrefix ? offset + affixLen : offset; + if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) { + return null; + } } if (stripLen == 0) return word; @@ -547,33 +545,6 @@ final class Stemmer { return false; } - /** checks condition of the concatenation of two strings */ - // note: this is pretty stupid, we really should subtract strip from the condition up front and - // just check the stem - // but this is a little bit more complicated. - private boolean checkCondition( - int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) { - int condition = dictionary.getAffixCondition(affix); - if (condition != 0) { - CharacterRunAutomaton pattern = dictionary.patterns.get(condition); - int state = 0; - for (int i = c1off; i < c1off + c1len; i++) { - state = pattern.step(state, c1[i]); - if (state == -1) { - return false; - } - } - for (int i = c2off; i < c2off + c2len; i++) { - state = pattern.step(state, c2[i]); - if (state == -1) { - return false; - } - } - return pattern.isAccept(state); - } - return true; - } - /** * Applies the affix rule to the given word, producing a list of stems if any are found * diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java new file mode 100644 index 00000000000..c419040f92e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE; +import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY; +import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX; +import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX; + +import org.apache.lucene.util.LuceneTestCase; + +public class TestAffixCondition extends LuceneTestCase { + + public void testPlainSuffixMatching() { + AffixCondition condition = AffixCondition.compile(SUFFIX, "b", "ab", ""); + assertTrue(condition.acceptsStem("a")); + assertFalse(condition.acceptsStem("b")); + assertFalse(condition.acceptsStem("ab")); + } + + public void testPlainPrefixMatching() { + AffixCondition condition = AffixCondition.compile(PREFIX, "a", "ab", ""); + assertFalse(condition.acceptsStem("ab")); + assertTrue(condition.acceptsStem("b")); + assertFalse(condition.acceptsStem("a")); + } + + public void testDotMatching() { + AffixCondition condition = AffixCondition.compile(PREFIX, "", "wr.", ""); + assertTrue(condition.acceptsStem("wry")); + assertTrue(condition.acceptsStem("wrong")); + assertFalse(condition.acceptsStem("white")); + } + + public void testUniqueKey() { + assertNotEquals( + 
AffixCondition.uniqueKey(PREFIX, "", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x")); + assertNotEquals( + AffixCondition.uniqueKey(SUFFIX, "y", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x")); + assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "", ".")); + assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "abc")); + + assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "bc")); + assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "abc", "ab")); + } + + public void testConditionHasBracketsIntersectingWithStrip() { + assertTrue(AffixCondition.compile(SUFFIX, "oj", "[io]j", "").acceptsStem("whatever")); + assertTrue(AffixCondition.compile(SUFFIX, "oj", "o[ioj", "").acceptsStem("whatever")); + } + + public void testImpossibleCondition() { + assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "a", "b", "")); + } + + public void testNonHunspellPatternCharacters() { + assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)", "")); + assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^.x)", "")); + assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "[z](^ax)", "")); + assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)[z]", "")); + } +}