LUCENE-9778: Hunspell: speed up input conversion (#2376)

This commit is contained in:
Peter Gromov 2021-02-17 09:10:40 +01:00 committed by GitHub
parent 2d53c6073b
commit 2ae45cc985
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 169 additions and 137 deletions

View File

@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
/**
 * ICONV or OCONV replacement table.
 *
 * <p>The mappings are stored in an FST keyed by the UTF-16 units of each pattern, with the
 * replacement text as output. A small bit set of hashed first characters of all patterns is kept
 * alongside, so that positions where no pattern can start are skipped without touching the FST.
 */
class ConvTable {
  private final FST<CharsRef> fst;
  private final FixedBitSet firstCharHashes;
  private final int mod;

  /**
   * Builds the table from the given pattern-to-replacement mappings.
   *
   * @param mappings patterns (keys, expected to be non-empty) and their replacements (values)
   */
  ConvTable(TreeMap<String, String> mappings) {
    // Hash table size: at least 256, otherwise twice the highest one-bit of the mapping count.
    mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1);
    firstCharHashes = new FixedBitSet(mod);

    try {
      Outputs<CharsRef> charOutputs = CharSequenceOutputs.getSingleton();
      FSTCompiler<CharsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, charOutputs);
      IntsRefBuilder keyInts = new IntsRefBuilder();
      for (Map.Entry<String, String> mapping : mappings.entrySet()) {
        String pattern = mapping.getKey();
        assert !pattern.isEmpty();
        // Remember the hashed first character so mightReplaceChar can reject quickly.
        firstCharHashes.set(pattern.charAt(0) % mod);

        Util.toUTF16(pattern, keyInts);
        compiler.add(keyInts.get(), new CharsRef(mapping.getValue()));
      }
      fst = compiler.compile();
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  /**
   * Applies the replacements to {@code sb} in place, scanning left to right. At each position the
   * longest matching pattern wins; scanning then resumes right after the inserted replacement, so
   * replacement text is not itself matched again.
   *
   * @param sb the text to convert; modified in place
   */
  void applyMappings(StringBuilder sb) {
    // FST traversal state is created lazily, only once a candidate start character is seen.
    FST.BytesReader reader = null;
    FST.Arc<CharsRef> rootArc = null;
    FST.Arc<CharsRef> current = null;

    int start = 0;
    while (start < sb.length()) {
      if (mightReplaceChar(sb.charAt(start))) {
        if (rootArc == null) {
          rootArc = fst.getFirstArc(new FST.Arc<>());
          reader = fst.getBytesReader();
          current = new FST.Arc<>();
        }
        current.copyFrom(rootArc);

        CharsRef accumulated = fst.outputs.getNoOutput();
        int matchEnd = -1; // index of the last char of the longest match, or -1 if none
        CharsRef replacement = null;
        try {
          for (int pos = start; pos < sb.length(); pos++) {
            if (fst.findTargetArc(sb.charAt(pos), current, current, reader) == null) {
              break;
            }
            accumulated = fst.outputs.add(accumulated, current.output());
            if (current.isFinal()) {
              // A pattern ends here; keep going in case a longer one matches as well.
              replacement = fst.outputs.add(accumulated, current.nextFinalOutput());
              matchEnd = pos;
            }
          }
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }

        if (matchEnd >= 0) {
          sb.delete(start, matchEnd + 1);
          sb.insert(start, replacement);
          // Together with the increment below, this lands just past the inserted text.
          start += replacement.length - 1;
        }
      }
      start++;
    }
  }

  /**
   * Quick pre-check for whether some pattern might start with {@code c}.
   *
   * @return {@code false} if no pattern starts with {@code c}; {@code true} if one might (different
   *     characters can share a hash slot, so this can be a false positive)
   */
  boolean mightReplaceChar(char c) {
    int slot = c % mod;
    return firstCharHashes.get(slot);
  }
}

View File

@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
@ -172,13 +169,7 @@ public class Dictionary {
int maxNGramSuggestions = Integer.MAX_VALUE;
boolean onlyMaxDiff;
char noSuggest, subStandard;
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
FST<CharsRef> oconv;
boolean needsInputCleaning;
boolean needsOutputCleaning;
ConvTable iconv, oconv;
// true if we can strip suffixes "down to nothing"
boolean fullStrip;
@ -224,8 +215,6 @@ public class Dictionary {
boolean ignoreCase)
throws IOException, ParseException {
this.ignoreCase = ignoreCase;
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV
try (BufferedInputStream affixStream =
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
@ -379,16 +368,13 @@ public class Dictionary {
} else if ("IGNORE".equals(firstWord)) {
ignore = singleArgument(reader, line).toCharArray();
Arrays.sort(ignore);
needsInputCleaning = true;
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
int num = parseNum(reader, line);
FST<CharsRef> res = parseConversions(reader, num);
ConvTable res = parseConversions(reader, num);
if (line.startsWith("I")) {
iconv = res;
needsInputCleaning |= iconv != null;
} else {
oconv = res;
needsOutputCleaning |= oconv != null;
}
} else if ("FULLSTRIP".equals(firstWord)) {
fullStrip = true;
@ -803,9 +789,8 @@ public class Dictionary {
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
if (needsInputCleaning) {
CharSequence cleaned = cleanInput(affixArg, sb);
affixArg = cleaned.toString();
if (needsInputCleaning(affixArg)) {
affixArg = cleanInput(affixArg, sb).toString();
}
if (isSuffix) {
@ -840,9 +825,9 @@ public class Dictionary {
return affixData(affix, AFFIX_CONDITION) >>> 1;
}
private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
private ConvTable parseConversions(LineNumberReader reader, int num)
throws IOException, ParseException {
Map<String, String> mappings = new TreeMap<>();
TreeMap<String, String> mappings = new TreeMap<>();
for (int i = 0; i < num; i++) {
String[] parts = splitBySpace(reader, reader.readLine(), 3);
@ -851,15 +836,7 @@ public class Dictionary {
}
}
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, String> entry : mappings.entrySet()) {
Util.toUTF16(entry.getKey(), scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
}
return fstCompiler.compile();
return new ConvTable(mappings);
}
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
@ -1085,8 +1062,9 @@ public class Dictionary {
int sep = flagSep < 0 ? morphSep : flagSep;
CharSequence toWrite;
if (needsInputCleaning) {
cleanInput(line, sep, reuse);
String beforeSep = line.substring(0, sep);
if (needsInputCleaning(beforeSep)) {
cleanInput(beforeSep, reuse);
reuse.append(line, sep, line.length());
toWrite = reuse;
} else {
@ -1571,14 +1549,28 @@ public class Dictionary {
return flagLookup.hasFlag(entryId, flag);
}
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
return cleanInput(input, input.length(), reuse);
boolean mayNeedInputCleaning() {
return ignoreCase || ignore != null || iconv != null;
}
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
boolean needsInputCleaning(CharSequence input) {
if (mayNeedInputCleaning()) {
for (int i = 0; i < input.length(); i++) {
char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0
|| ignoreCase && caseFold(ch) != ch
|| iconv != null && iconv.mightReplaceChar(ch)) {
return true;
}
}
}
return false;
}
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
reuse.setLength(0);
for (int i = 0; i < prefixLength; i++) {
for (int i = 0; i < input.length(); i++) {
char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@ -1594,11 +1586,7 @@ public class Dictionary {
}
if (iconv != null) {
try {
applyMappings(iconv, reuse);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
iconv.applyMappings(reuse);
if (ignoreCase) {
for (int i = 0; i < reuse.length(); i++) {
reuse.setCharAt(i, caseFold(reuse.charAt(i)));
@ -1624,44 +1612,6 @@ public class Dictionary {
}
}
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
// temporary stuff
final FST.Arc<CharsRef> arc = new FST.Arc<>();
int longestMatch;
CharsRef longestOutput;
for (int i = 0; i < sb.length(); i++) {
arc.copyFrom(firstArc);
CharsRef output = NO_OUTPUT;
longestMatch = -1;
longestOutput = null;
for (int j = i; j < sb.length(); j++) {
char ch = sb.charAt(j);
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
break;
} else {
output = fst.outputs.add(output, arc.output());
}
if (arc.isFinal()) {
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
longestMatch = j;
}
}
if (longestMatch >= 0) {
sb.delete(i, longestMatch + 1);
sb.insert(i, longestOutput);
i += (longestOutput.length - 1);
}
}
}
/** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
public boolean getIgnoreCase() {
return ignoreCase;

View File

@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
@ -72,7 +71,7 @@ public class Hunspell {
checkCanceled.run();
if (word.isEmpty()) return true;
if (dictionary.needsInputCleaning) {
if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
@ -479,7 +478,7 @@ public class Hunspell {
checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning) {
if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
@ -565,14 +564,10 @@ public class Hunspell {
}
private String cleanOutput(String s) {
if (!dictionary.needsOutputCleaning) return s;
if (dictionary.oconv == null) return s;
try {
StringBuilder sb = new StringBuilder(s);
Dictionary.applyMappings(dictionary.oconv, sb);
dictionary.oconv.applyMappings(sb);
return sb.toString();
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
}

View File

@ -83,15 +83,17 @@ final class Stemmer {
*/
public List<CharsRef> stem(char[] word, int length) {
if (dictionary.needsInputCleaning) {
if (dictionary.mayNeedInputCleaning()) {
scratchSegment.setLength(0);
scratchSegment.append(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
length = segment.length();
segment.getChars(0, length, scratchBuffer, 0);
word = scratchBuffer;
}
}
List<CharsRef> list = new ArrayList<>();
RootProcessor processor =
@ -365,18 +367,14 @@ final class Stemmer {
private CharsRef newStem(CharsRef stem, int morphDataId) {
String exception = stemException(morphDataId);
if (dictionary.needsOutputCleaning) {
if (dictionary.oconv != null) {
scratchSegment.setLength(0);
if (exception != null) {
scratchSegment.append(exception);
} else {
scratchSegment.append(stem.chars, stem.offset, stem.length);
}
try {
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
dictionary.oconv.applyMappings(scratchSegment);
char[] cleaned = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length);

View File

@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.junit.Test;
public class TestDictionary extends LuceneTestCase {
@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
assertTrue(dictStream.isClosed());
}
public void testReplacements() throws Exception {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
// a -> b
Util.toUTF16("a", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
// ab -> c
Util.toUTF16("ab", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
// c -> de
Util.toUTF16("c", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
// def -> gh
Util.toUTF16("def", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
FST<CharsRef> fst = fstCompiler.compile();
public void testReplacements() {
TreeMap<String, String> map = new TreeMap<>();
map.put("a", "b");
map.put("ab", "c");
map.put("c", "de");
map.put("def", "gh");
ConvTable table = new ConvTable(map);
StringBuilder sb = new StringBuilder("atestanother");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("btestbnother", sb.toString());
sb = new StringBuilder("abtestanother");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("ctestbnother", sb.toString());
sb = new StringBuilder("atestabnother");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("btestcnother", sb.toString());
sb = new StringBuilder("abtestabnother");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("ctestcnother", sb.toString());
sb = new StringBuilder("abtestabcnother");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("ctestcdenother", sb.toString());
sb = new StringBuilder("defdefdefc");
Dictionary.applyMappings(fst, sb);
table.applyMappings(sb);
assertEquals("ghghghde", sb.toString());
}