LUCENE-9806: Hunspell: speed up affix condition checking (#2423)

* LUCENE-9806: Hunspell: speed up affix condition checking

check only stem beginning/end without strip/condition, not the whole candidate
avoid regexp if possible

* hunspell: simplify AffixCondition, add more tests

* add a license to the test
This commit is contained in:
Peter Gromov 2021-02-24 17:45:35 +01:00 committed by GitHub
parent e1ff4c1354
commit 3a99e2aa82
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 300 additions and 101 deletions

View File

@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Checks the "condition" part of affix definition, as in
*
* <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
*/
interface AffixCondition {
  /** Key returned by {@link #uniqueKey} for conditions that are trivially satisfied. */
  String ALWAYS_TRUE_KEY = ".*";

  /** A condition accepting every stem. */
  AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;

  /** A condition rejecting every stem. */
  AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;

  /**
   * Convenience overload of {@link #acceptsStem(char[], int, int)} checking the whole string.
   *
   * @param stem the stem, with both "strip" and "affix" already removed
   * @return whether the stem satisfies this condition
   */
  default boolean acceptsStem(String stem) {
    return acceptsStem(stem.toCharArray(), 0, stem.length());
  }

  /**
   * @param word the buffer containing the stem
   * @param offset index of the first stem character in {@code word}
   * @param length number of stem characters
   * @return whether the given word matches this condition as a stem with both "strip" and "affix"
   *     removed
   */
  boolean acceptsStem(char[] word, int offset, int length);

  /**
   * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
   *     that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
   */
  static String uniqueKey(AffixKind kind, String strip, String condition) {
    if (".".equals(condition)) {
      return ALWAYS_TRUE_KEY;
    }
    // A literal comparison against the strip is only meaningful for plain conditions:
    // a strip that happens to contain the text "[ab]" does not match the character class [ab].
    if (!isRegexp(condition) && isCoveredByStrip(kind, strip, condition)) {
      return ALWAYS_TRUE_KEY;
    }
    return condition + " " + kind + " " + strip;
  }

  /**
   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
   * check that condition.
   *
   * <p>The condition describes the beginning (for prefixes) or the end (for suffixes) of the word
   * that still contains the strip. The part of the condition overlapping the strip is verified
   * here, once; the returned object only checks the remaining part against the stem.
   *
   * @param kind whether the rule is a prefix or a suffix rule
   * @param strip the characters the rule strips from the word
   * @param condition the condition as written in the affix file
   * @param line the affix file line, used only in the error message
   * @throws IllegalArgumentException if the condition can't be analyzed; the message contains
   *     {@code line}
   */
  static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
    if (!isRegexp(condition)) {
      if (kind == SUFFIX && condition.endsWith(strip)) {
        return substringCondition(
            kind, condition.substring(0, condition.length() - strip.length()));
      }
      if (kind == PREFIX && condition.startsWith(strip)) {
        return substringCondition(kind, condition.substring(strip.length()));
      }
      // The strip is longer than the condition and contains it at the relevant edge:
      // nothing is left to check on the stem (consistent with uniqueKey).
      return isCoveredByStrip(kind, strip, condition) ? ALWAYS_TRUE : ALWAYS_FALSE;
    }

    int lastBracket = condition.lastIndexOf('[');
    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
      condition = condition + "]";
    }

    try {
      int conditionChars = countCharPatterns(condition);
      if (conditionChars <= strip.length()) {
        // The whole condition falls inside the strip. A prefix condition is anchored at the
        // start of the strip, a suffix condition at its end.
        String regex = kind == PREFIX ? condition + ".*" : ".*" + condition;
        return stripMatches(strip, regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
      }

      // number of condition char patterns that apply to the stem rather than to the strip
      int stemChars = conditionChars - strip.length();
      if (kind == PREFIX) {
        int split = skipCharPatterns(condition, strip.length());
        if (!stripMatches(strip, condition.substring(0, split))) {
          return ALWAYS_FALSE;
        }
        return regexpCondition(kind, condition.substring(split), stemChars);
      }

      int split = skipCharPatterns(condition, stemChars);
      if (!stripMatches(strip, condition.substring(split))) {
        return ALWAYS_FALSE;
      }
      return regexpCondition(kind, condition.substring(0, split), stemChars);
    } catch (PatternSyntaxException e) {
      // characters that java.util.regex can't parse make the condition unsatisfiable
      return ALWAYS_FALSE;
    } catch (RuntimeException | AssertionError e) {
      throw new IllegalArgumentException("On line: " + line, e);
    }
  }

  /** Whether the strip literally starts (prefix) or ends (suffix) with the condition text. */
  private static boolean isCoveredByStrip(AffixKind kind, String strip, String condition) {
    return (kind == PREFIX && strip.startsWith(condition))
        || (kind == SUFFIX && strip.endsWith(condition));
  }

  /** Matches the whole strip against the given regex, treating dashes as plain characters. */
  private static boolean stripMatches(String strip, String regex) {
    return strip.matches(escapeDash(regex));
  }

  /** @return the position in the condition after skipping {@code count} char patterns */
  private static int skipCharPatterns(String condition, int count) {
    int pos = 0;
    for (int i = 0; i < count; i++) {
      pos = skipCharPattern(condition, pos);
    }
    return pos;
  }

  /** @return the number of char patterns (single characters or [...] groups) in the condition */
  private static int countCharPatterns(String condition) {
    int conditionChars = 0;
    for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) {
      conditionChars++;
    }
    return conditionChars;
  }

  /**
   * @return the position right after the char pattern starting at {@code pos}: a whole [...] group
   *     or a single character
   */
  private static int skipCharPattern(String condition, int pos) {
    if (condition.charAt(pos) == '[') {
      pos = condition.indexOf(']', pos + 1);
      if (pos < 0) {
        throw new AssertionError("Malformed condition " + condition);
      }
    }
    return pos + 1;
  }

  /** Whether the condition contains characters that prevent a plain substring comparison. */
  private static boolean isRegexp(String condition) {
    return condition.contains("[") || condition.contains(".") || condition.contains("-");
  }

  /**
   * Creates a condition comparing the stem's beginning (prefix) or end (suffix) with the given
   * text character by character.
   */
  private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    int condLength = stemCondition.length();
    return (word, offset, length) -> {
      if (length < condLength) {
        return false;
      }
      int matchStart = forSuffix ? offset + length - condLength : offset;
      for (int i = 0; i < condLength; i++) {
        if (stemCondition.charAt(i) != word[matchStart + i]) {
          return false;
        }
      }
      return true;
    };
  }

  /**
   * Creates a condition running an automaton built from {@code condition} over the first (prefix)
   * or the last (suffix) {@code charCount} characters of the stem.
   */
  private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    CharacterRunAutomaton automaton =
        new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
    return (word, offset, length) ->
        length >= charCount
            && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
  }

  // "dash hasn't got special meaning" (we must escape it)
  private static String escapeDash(String re) {
    if (!re.contains("-")) {
      return re;
    }
    // we have to be careful, even though dash doesn't have a special meaning,
    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
    StringBuilder escaped = new StringBuilder();
    for (int i = 0; i < re.length(); i++) {
      char c = re.charAt(i);
      if (c == '-') {
        escaped.append("\\-");
      } else {
        escaped.append(c);
        if (c == '\\' && i + 1 < re.length()) {
          // copy an already escaped character verbatim
          escaped.append(re.charAt(i + 1));
          i++;
        }
      }
    }
    return escaped.toString();
  }
}

View File

@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
/** The kind of an affix rule: whether it applies to the beginning or to the end of a word. */
enum AffixKind {
  /** A rule declared by a {@code PFX} line of the affix file. */
  PREFIX,

  /** A rule declared by an {@code SFX} line of the affix file. */
  SUFFIX
}

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.AffixKind.*;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -89,7 +89,7 @@ public class Dictionary {
* All condition checks used by prefixes and suffixes. these are typically re-used across many * All condition checks used by prefixes and suffixes. these are typically re-used across many
* affix stripping rules. so these are deduplicated, to save RAM. * affix stripping rules. so these are deduplicated, to save RAM.
*/ */
ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>(); ArrayList<AffixCondition> patterns = new ArrayList<>();
/** /**
* The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list * The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
@ -338,7 +338,7 @@ public class Dictionary {
Map<String, Integer> seenPatterns = new HashMap<>(); Map<String, Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord // zero condition -> 0 ord
seenPatterns.put(".*", 0); seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
patterns.add(null); patterns.add(null);
// zero strip -> 0 ord // zero strip -> 0 ord
@ -362,9 +362,11 @@ public class Dictionary {
} else if ("AM".equals(firstWord)) { } else if ("AM".equals(firstWord)) {
parseMorphAlias(line); parseMorphAlias(line);
} else if ("PFX".equals(firstWord)) { } else if ("PFX".equals(firstWord)) {
parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags); parseAffix(
prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
} else if ("SFX".equals(firstWord)) { } else if ("SFX".equals(firstWord)) {
parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags); parseAffix(
suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
} else if (line.equals("COMPLEXPREFIXES")) { } else if (line.equals("COMPLEXPREFIXES")) {
complexPrefixes = complexPrefixes =
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@ -655,25 +657,6 @@ public class Dictionary {
return fstCompiler.compile(); return fstCompiler.compile();
} }
static String escapeDash(String re) {
// we have to be careful, even though dash doesn't have a special meaning,
// some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
StringBuilder escaped = new StringBuilder();
for (int i = 0; i < re.length(); i++) {
char c = re.charAt(i);
if (c == '-') {
escaped.append("\\-");
} else {
escaped.append(c);
if (c == '\\' && i + 1 < re.length()) {
escaped.append(re.charAt(i + 1));
i++;
}
}
}
return escaped.toString();
}
/** /**
* Parses a specific affix rule putting the result into the provided affix map * Parses a specific affix rule putting the result into the provided affix map
* *
@ -688,7 +671,7 @@ public class Dictionary {
Set<Character> secondStageFlags, Set<Character> secondStageFlags,
String header, String header,
LineNumberReader reader, LineNumberReader reader,
boolean isSuffix, AffixKind kind,
Map<String, Integer> seenPatterns, Map<String, Integer> seenPatterns,
Map<String, Integer> seenStrips, Map<String, Integer> seenStrips,
FlagEnumerator flags) FlagEnumerator flags)
@ -738,41 +721,18 @@ public class Dictionary {
} }
String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue String key = AffixCondition.uniqueKey(kind, strip, condition);
if (condition.startsWith("[") && condition.indexOf(']') == -1) {
condition = condition + "]";
}
// "dash hasn't got special meaning" (we must escape it)
if (condition.indexOf('-') >= 0) {
condition = escapeDash(condition);
}
final String regex;
if (".".equals(condition)) {
regex = ".*"; // Zero condition is indicated by dot
} else if (condition.equals(strip)) {
regex = ".*"; // TODO: optimize this better:
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
// but this is complicated...
} else {
// TODO: really for suffixes we should reverse the automaton and run them backwards
regex = isSuffix ? ".*" + condition : condition + ".*";
}
// deduplicate patterns // deduplicate patterns
Integer patternIndex = seenPatterns.get(regex); Integer patternIndex = seenPatterns.get(key);
if (patternIndex == null) { if (patternIndex == null) {
patternIndex = patterns.size(); patternIndex = patterns.size();
if (patternIndex > Short.MAX_VALUE) { if (patternIndex > Short.MAX_VALUE) {
throw new UnsupportedOperationException( throw new UnsupportedOperationException(
"Too many patterns, please report this to dev@lucene.apache.org"); "Too many patterns, please report this to dev@lucene.apache.org");
} }
seenPatterns.put(regex, patternIndex); seenPatterns.put(key, patternIndex);
try { patterns.add(AffixCondition.compile(kind, strip, condition, line));
patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton()));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e);
}
} }
Integer stripOrd = seenStrips.get(strip); Integer stripOrd = seenStrips.get(strip);
@ -811,7 +771,7 @@ public class Dictionary {
affixArg = cleanInput(affixArg, sb).toString(); affixArg = cleanInput(affixArg, sb).toString();
} }
if (isSuffix) { if (kind == SUFFIX) {
affixArg = new StringBuilder(affixArg).reverse().toString(); affixArg = new StringBuilder(affixArg).reverse().toString();
} }
@ -820,17 +780,6 @@ public class Dictionary {
} }
} }
private static RegExp conditionRegexp(String regex) {
try {
return new RegExp(regex, RegExp.NONE);
} catch (IllegalArgumentException e) {
if (e.getMessage().contains("expected ']'")) {
return conditionRegexp(regex + "]");
}
throw e;
}
}
char affixData(int affixIndex, int offset) { char affixData(int affixIndex, int offset) {
return affixData[affixIndex * 4 + offset]; return affixData[affixIndex * 4 + offset];
} }

View File

@ -269,7 +269,7 @@ class GeneratingSuggester {
private boolean checkAffixCondition(int suffixId, String stem) { private boolean checkAffixCondition(int suffixId, String stem) {
int condition = dictionary.getAffixCondition(suffixId); int condition = dictionary.getAffixCondition(suffixId);
return condition == 0 || dictionary.patterns.get(condition).run(stem); return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem);
} }
private int affixStripLength(int affixId) { private int affixStripLength(int affixId) {

View File

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
/** /**
@ -486,13 +485,12 @@ final class Stemmer {
int stripLen = stripEnd - stripStart; int stripLen = stripEnd - stripStart;
char[] stripData = dictionary.stripData; char[] stripData = dictionary.stripData;
boolean condition = int condition = dictionary.getAffixCondition(affix);
isPrefix if (condition != 0) {
? checkCondition( int deAffixedOffset = isPrefix ? offset + affixLen : offset;
affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen) if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
: checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen); return null;
if (!condition) { }
return null;
} }
if (stripLen == 0) return word; if (stripLen == 0) return word;
@ -547,33 +545,6 @@ final class Stemmer {
return false; return false;
} }
/** checks condition of the concatenation of two strings */
// note: this is pretty stupid, we really should subtract strip from the condition up front and
// just check the stem
// but this is a little bit more complicated.
private boolean checkCondition(
int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
int condition = dictionary.getAffixCondition(affix);
if (condition != 0) {
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
int state = 0;
for (int i = c1off; i < c1off + c1len; i++) {
state = pattern.step(state, c1[i]);
if (state == -1) {
return false;
}
}
for (int i = c2off; i < c2off + c2len; i++) {
state = pattern.step(state, c2[i]);
if (state == -1) {
return false;
}
}
return pattern.isAccept(state);
}
return true;
}
/** /**
* Applies the affix rule to the given word, producing a list of stems if any are found * Applies the affix rule to the given word, producing a list of stems if any are found
* *

View File

@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE;
import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY;
import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
import org.apache.lucene.util.LuceneTestCase;
/** Unit tests for compiling and evaluating {@link AffixCondition} instances. */
public class TestAffixCondition extends LuceneTestCase {

  /** Compiles a condition the way the tests need it: with an empty source line. */
  private static AffixCondition conditionFor(AffixKind kind, String strip, String condition) {
    return AffixCondition.compile(kind, strip, condition, "");
  }

  /** A plain suffix condition is checked against the stem end, with the strip cut off. */
  public void testPlainSuffixMatching() {
    AffixCondition endsWithA = conditionFor(SUFFIX, "b", "ab");
    assertTrue(endsWithA.acceptsStem("a"));
    assertFalse(endsWithA.acceptsStem("b"));
    assertFalse(endsWithA.acceptsStem("ab"));
  }

  /** A plain prefix condition is checked against the stem start, with the strip cut off. */
  public void testPlainPrefixMatching() {
    AffixCondition startsWithB = conditionFor(PREFIX, "a", "ab");
    assertFalse(startsWithB.acceptsStem("ab"));
    assertTrue(startsWithB.acceptsStem("b"));
    assertFalse(startsWithB.acceptsStem("a"));
  }

  /** A dot in the condition stands for any single character. */
  public void testDotMatching() {
    AffixCondition wrAndAnyChar = conditionFor(PREFIX, "", "wr.");
    assertTrue(wrAndAnyChar.acceptsStem("wry"));
    assertTrue(wrAndAnyChar.acceptsStem("wrong"));
    assertFalse(wrAndAnyChar.acceptsStem("white"));
  }

  /** Keys differ by kind and strip; trivially satisfied conditions share the always-true key. */
  public void testUniqueKey() {
    String suffixKeyWithoutStrip = AffixCondition.uniqueKey(SUFFIX, "", "x");
    assertNotEquals(AffixCondition.uniqueKey(PREFIX, "", "x"), suffixKeyWithoutStrip);
    assertNotEquals(AffixCondition.uniqueKey(SUFFIX, "y", "x"), suffixKeyWithoutStrip);

    String dotKey = AffixCondition.uniqueKey(PREFIX, "", ".");
    assertEquals(ALWAYS_TRUE_KEY, dotKey);

    String sameAsStripKey = AffixCondition.uniqueKey(SUFFIX, "abc", "abc");
    assertEquals(ALWAYS_TRUE_KEY, sameAsStripKey);

    String stripEndKey = AffixCondition.uniqueKey(SUFFIX, "abc", "bc");
    assertEquals(ALWAYS_TRUE_KEY, stripEndKey);

    String stripStartKey = AffixCondition.uniqueKey(PREFIX, "abc", "ab");
    assertEquals(ALWAYS_TRUE_KEY, stripStartKey);
  }

  /** Bracket groups (including an unclosed one) fully covered by the strip accept any stem. */
  public void testConditionHasBracketsIntersectingWithStrip() {
    AffixCondition closedBrackets = conditionFor(SUFFIX, "oj", "[io]j");
    assertTrue(closedBrackets.acceptsStem("whatever"));

    AffixCondition unclosedBrackets = conditionFor(SUFFIX, "oj", "o[ioj");
    assertTrue(unclosedBrackets.acceptsStem("whatever"));
  }

  /** A condition contradicting the strip can never be satisfied. */
  public void testImpossibleCondition() {
    AffixCondition contradictsStrip = conditionFor(SUFFIX, "a", "b");
    assertEquals(ALWAYS_FALSE, contradictsStrip);
  }

  /** Regexp syntax outside of what Hunspell conditions use yields a never-matching condition. */
  public void testNonHunspellPatternCharacters() {
    String[] unsupportedConditions = {"(^ax)", "(^.x)", "[z](^ax)", "(^ax)[z]"};
    for (String unsupported : unsupportedConditions) {
      assertEquals(ALWAYS_FALSE, conditionFor(SUFFIX, "x", unsupported));
    }
  }
}