mirror of https://github.com/apache/lucene.git
LUCENE-9778: Hunspell: speed up input conversion (#2376)
This commit is contained in:
parent
2d53c6073b
commit
2ae45cc985
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/** ICONV or OCONV replacement table */
|
||||
class ConvTable {
|
||||
private final FST<CharsRef> fst;
|
||||
private final FixedBitSet firstCharHashes;
|
||||
private final int mod;
|
||||
|
||||
ConvTable(TreeMap<String, String> mappings) {
|
||||
mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1);
|
||||
firstCharHashes = new FixedBitSet(mod);
|
||||
|
||||
try {
|
||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
assert key.length() > 0;
|
||||
firstCharHashes.set(key.charAt(0) % mod);
|
||||
Util.toUTF16(key, scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
||||
}
|
||||
|
||||
fst = fstCompiler.compile();
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
}
|
||||
|
||||
void applyMappings(StringBuilder sb) {
|
||||
FST.BytesReader bytesReader = null;
|
||||
FST.Arc<CharsRef> firstArc = null;
|
||||
FST.Arc<CharsRef> arc = null;
|
||||
|
||||
int longestMatch;
|
||||
CharsRef longestOutput;
|
||||
|
||||
for (int i = 0; i < sb.length(); i++) {
|
||||
if (!mightReplaceChar(sb.charAt(i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (firstArc == null) {
|
||||
firstArc = fst.getFirstArc(new FST.Arc<>());
|
||||
bytesReader = fst.getBytesReader();
|
||||
arc = new FST.Arc<>();
|
||||
}
|
||||
arc.copyFrom(firstArc);
|
||||
CharsRef output = fst.outputs.getNoOutput();
|
||||
longestMatch = -1;
|
||||
longestOutput = null;
|
||||
|
||||
for (int j = i; j < sb.length(); j++) {
|
||||
char ch = sb.charAt(j);
|
||||
|
||||
try {
|
||||
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
||||
break;
|
||||
}
|
||||
output = fst.outputs.add(output, arc.output());
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
if (arc.isFinal()) {
|
||||
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
longestMatch = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (longestMatch >= 0) {
|
||||
sb.delete(i, longestMatch + 1);
|
||||
sb.insert(i, longestOutput);
|
||||
i += (longestOutput.length - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean mightReplaceChar(char c) {
|
||||
return firstCharHashes.get(c % mod);
|
||||
}
|
||||
}
|
|
@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
|
@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
|||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
|
||||
|
@ -172,13 +169,7 @@ public class Dictionary {
|
|||
int maxNGramSuggestions = Integer.MAX_VALUE;
|
||||
boolean onlyMaxDiff;
|
||||
char noSuggest, subStandard;
|
||||
|
||||
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
||||
FST<CharsRef> iconv;
|
||||
FST<CharsRef> oconv;
|
||||
|
||||
boolean needsInputCleaning;
|
||||
boolean needsOutputCleaning;
|
||||
ConvTable iconv, oconv;
|
||||
|
||||
// true if we can strip suffixes "down to nothing"
|
||||
boolean fullStrip;
|
||||
|
@ -224,8 +215,6 @@ public class Dictionary {
|
|||
boolean ignoreCase)
|
||||
throws IOException, ParseException {
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.needsInputCleaning = ignoreCase;
|
||||
this.needsOutputCleaning = false; // set if we have an OCONV
|
||||
|
||||
try (BufferedInputStream affixStream =
|
||||
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
|
||||
|
@ -379,16 +368,13 @@ public class Dictionary {
|
|||
} else if ("IGNORE".equals(firstWord)) {
|
||||
ignore = singleArgument(reader, line).toCharArray();
|
||||
Arrays.sort(ignore);
|
||||
needsInputCleaning = true;
|
||||
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
|
||||
int num = parseNum(reader, line);
|
||||
FST<CharsRef> res = parseConversions(reader, num);
|
||||
ConvTable res = parseConversions(reader, num);
|
||||
if (line.startsWith("I")) {
|
||||
iconv = res;
|
||||
needsInputCleaning |= iconv != null;
|
||||
} else {
|
||||
oconv = res;
|
||||
needsOutputCleaning |= oconv != null;
|
||||
}
|
||||
} else if ("FULLSTRIP".equals(firstWord)) {
|
||||
fullStrip = true;
|
||||
|
@ -803,9 +789,8 @@ public class Dictionary {
|
|||
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
|
||||
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
|
||||
|
||||
if (needsInputCleaning) {
|
||||
CharSequence cleaned = cleanInput(affixArg, sb);
|
||||
affixArg = cleaned.toString();
|
||||
if (needsInputCleaning(affixArg)) {
|
||||
affixArg = cleanInput(affixArg, sb).toString();
|
||||
}
|
||||
|
||||
if (isSuffix) {
|
||||
|
@ -840,9 +825,9 @@ public class Dictionary {
|
|||
return affixData(affix, AFFIX_CONDITION) >>> 1;
|
||||
}
|
||||
|
||||
private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
|
||||
private ConvTable parseConversions(LineNumberReader reader, int num)
|
||||
throws IOException, ParseException {
|
||||
Map<String, String> mappings = new TreeMap<>();
|
||||
TreeMap<String, String> mappings = new TreeMap<>();
|
||||
|
||||
for (int i = 0; i < num; i++) {
|
||||
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
||||
|
@ -851,15 +836,7 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
||||
Util.toUTF16(entry.getKey(), scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
||||
}
|
||||
|
||||
return fstCompiler.compile();
|
||||
return new ConvTable(mappings);
|
||||
}
|
||||
|
||||
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
|
||||
|
@ -1085,8 +1062,9 @@ public class Dictionary {
|
|||
int sep = flagSep < 0 ? morphSep : flagSep;
|
||||
|
||||
CharSequence toWrite;
|
||||
if (needsInputCleaning) {
|
||||
cleanInput(line, sep, reuse);
|
||||
String beforeSep = line.substring(0, sep);
|
||||
if (needsInputCleaning(beforeSep)) {
|
||||
cleanInput(beforeSep, reuse);
|
||||
reuse.append(line, sep, line.length());
|
||||
toWrite = reuse;
|
||||
} else {
|
||||
|
@ -1571,14 +1549,28 @@ public class Dictionary {
|
|||
return flagLookup.hasFlag(entryId, flag);
|
||||
}
|
||||
|
||||
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
|
||||
return cleanInput(input, input.length(), reuse);
|
||||
boolean mayNeedInputCleaning() {
|
||||
return ignoreCase || ignore != null || iconv != null;
|
||||
}
|
||||
|
||||
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
|
||||
boolean needsInputCleaning(CharSequence input) {
|
||||
if (mayNeedInputCleaning()) {
|
||||
for (int i = 0; i < input.length(); i++) {
|
||||
char ch = input.charAt(i);
|
||||
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0
|
||||
|| ignoreCase && caseFold(ch) != ch
|
||||
|| iconv != null && iconv.mightReplaceChar(ch)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
|
||||
reuse.setLength(0);
|
||||
|
||||
for (int i = 0; i < prefixLength; i++) {
|
||||
for (int i = 0; i < input.length(); i++) {
|
||||
char ch = input.charAt(i);
|
||||
|
||||
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
|
||||
|
@ -1594,11 +1586,7 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
if (iconv != null) {
|
||||
try {
|
||||
applyMappings(iconv, reuse);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
iconv.applyMappings(reuse);
|
||||
if (ignoreCase) {
|
||||
for (int i = 0; i < reuse.length(); i++) {
|
||||
reuse.setCharAt(i, caseFold(reuse.charAt(i)));
|
||||
|
@ -1624,44 +1612,6 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: this could be more efficient!
|
||||
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
|
||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
||||
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
|
||||
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
|
||||
// temporary stuff
|
||||
final FST.Arc<CharsRef> arc = new FST.Arc<>();
|
||||
int longestMatch;
|
||||
CharsRef longestOutput;
|
||||
|
||||
for (int i = 0; i < sb.length(); i++) {
|
||||
arc.copyFrom(firstArc);
|
||||
CharsRef output = NO_OUTPUT;
|
||||
longestMatch = -1;
|
||||
longestOutput = null;
|
||||
|
||||
for (int j = i; j < sb.length(); j++) {
|
||||
char ch = sb.charAt(j);
|
||||
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
||||
break;
|
||||
} else {
|
||||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
if (arc.isFinal()) {
|
||||
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
longestMatch = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (longestMatch >= 0) {
|
||||
sb.delete(i, longestMatch + 1);
|
||||
sb.insert(i, longestOutput);
|
||||
i += (longestOutput.length - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
|
||||
public boolean getIgnoreCase() {
|
||||
return ignoreCase;
|
||||
|
|
|
@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
|||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashSet;
|
||||
|
@ -72,7 +71,7 @@ public class Hunspell {
|
|||
checkCanceled.run();
|
||||
if (word.isEmpty()) return true;
|
||||
|
||||
if (dictionary.needsInputCleaning) {
|
||||
if (dictionary.needsInputCleaning(word)) {
|
||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
|
@ -479,7 +478,7 @@ public class Hunspell {
|
|||
checkCanceled.run();
|
||||
if (word.length() >= 100) return Collections.emptyList();
|
||||
|
||||
if (dictionary.needsInputCleaning) {
|
||||
if (dictionary.needsInputCleaning(word)) {
|
||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
|
@ -565,14 +564,10 @@ public class Hunspell {
|
|||
}
|
||||
|
||||
private String cleanOutput(String s) {
|
||||
if (!dictionary.needsOutputCleaning) return s;
|
||||
if (dictionary.oconv == null) return s;
|
||||
|
||||
try {
|
||||
StringBuilder sb = new StringBuilder(s);
|
||||
Dictionary.applyMappings(dictionary.oconv, sb);
|
||||
dictionary.oconv.applyMappings(sb);
|
||||
return sb.toString();
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -83,15 +83,17 @@ final class Stemmer {
|
|||
*/
|
||||
public List<CharsRef> stem(char[] word, int length) {
|
||||
|
||||
if (dictionary.needsInputCleaning) {
|
||||
if (dictionary.mayNeedInputCleaning()) {
|
||||
scratchSegment.setLength(0);
|
||||
scratchSegment.append(word, 0, length);
|
||||
if (dictionary.needsInputCleaning(scratchSegment)) {
|
||||
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
|
||||
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
|
||||
length = segment.length();
|
||||
segment.getChars(0, length, scratchBuffer, 0);
|
||||
word = scratchBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
List<CharsRef> list = new ArrayList<>();
|
||||
RootProcessor processor =
|
||||
|
@ -365,18 +367,14 @@ final class Stemmer {
|
|||
private CharsRef newStem(CharsRef stem, int morphDataId) {
|
||||
String exception = stemException(morphDataId);
|
||||
|
||||
if (dictionary.needsOutputCleaning) {
|
||||
if (dictionary.oconv != null) {
|
||||
scratchSegment.setLength(0);
|
||||
if (exception != null) {
|
||||
scratchSegment.append(exception);
|
||||
} else {
|
||||
scratchSegment.append(stem.chars, stem.offset, stem.length);
|
||||
}
|
||||
try {
|
||||
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
dictionary.oconv.applyMappings(scratchSegment);
|
||||
char[] cleaned = new char[scratchSegment.length()];
|
||||
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
|
||||
return new CharsRef(cleaned, 0, cleaned.length);
|
||||
|
|
|
@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
|
|||
import java.text.ParseException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestDictionary extends LuceneTestCase {
|
||||
|
@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
|
|||
assertTrue(dictStream.isClosed());
|
||||
}
|
||||
|
||||
public void testReplacements() throws Exception {
|
||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
|
||||
// a -> b
|
||||
Util.toUTF16("a", scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
|
||||
|
||||
// ab -> c
|
||||
Util.toUTF16("ab", scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
|
||||
|
||||
// c -> de
|
||||
Util.toUTF16("c", scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
|
||||
|
||||
// def -> gh
|
||||
Util.toUTF16("def", scratchInts);
|
||||
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
|
||||
|
||||
FST<CharsRef> fst = fstCompiler.compile();
|
||||
public void testReplacements() {
|
||||
TreeMap<String, String> map = new TreeMap<>();
|
||||
map.put("a", "b");
|
||||
map.put("ab", "c");
|
||||
map.put("c", "de");
|
||||
map.put("def", "gh");
|
||||
ConvTable table = new ConvTable(map);
|
||||
|
||||
StringBuilder sb = new StringBuilder("atestanother");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("btestbnother", sb.toString());
|
||||
|
||||
sb = new StringBuilder("abtestanother");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("ctestbnother", sb.toString());
|
||||
|
||||
sb = new StringBuilder("atestabnother");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("btestcnother", sb.toString());
|
||||
|
||||
sb = new StringBuilder("abtestabnother");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("ctestcnother", sb.toString());
|
||||
|
||||
sb = new StringBuilder("abtestabcnother");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("ctestcdenother", sb.toString());
|
||||
|
||||
sb = new StringBuilder("defdefdefc");
|
||||
Dictionary.applyMappings(fst, sb);
|
||||
table.applyMappings(sb);
|
||||
assertEquals("ghghghde", sb.toString());
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue