diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java
new file mode 100644
index 00000000000..6a87167e7e1
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.TreeMap;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * ICONV or OCONV replacement table: maps source substrings to their replacement text via an FST.
+ * A bit set over the first characters of all keys lets callers cheaply skip characters that
+ * cannot start a replacement.
+ */
+class ConvTable {
+  private final FST<CharsRef> fst;
+
+  /** Bit {@code c % mod} is set for the first character {@code c} of every key. */
+  private final FixedBitSet firstCharHashes;
+
+  /** Number of bits in {@link #firstCharHashes}; never less than 256. */
+  private final int mod;
+
+  /**
+   * Compiles the given replacements into an FST.
+   *
+   * @param mappings source substrings (each non-empty) mapped to their replacement text
+   */
+  ConvTable(TreeMap<String, String> mappings) {
+    mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1);
+    firstCharHashes = new FixedBitSet(mod);
+
+    try {
+      Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+      FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
+      IntsRefBuilder scratchInts = new IntsRefBuilder();
+      // entries arrive in sorted key order (presumably what FSTCompiler expects - verify)
+      for (Map.Entry<String, String> entry : mappings.entrySet()) {
+        String key = entry.getKey();
+        assert key.length() > 0;
+        firstCharHashes.set(key.charAt(0) % mod);
+        Util.toUTF16(key, scratchInts);
+        fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
+      }
+
+      fst = fstCompiler.compile();
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
+  }
+
+  /**
+   * Rewrites {@code sb} in place. Scanning left to right, the longest key that matches at the
+   * current position is replaced by its mapped output; scanning then resumes after the inserted
+   * output, so replacement text is never matched again.
+   *
+   * @param sb the text to convert; modified in place
+   */
+  void applyMappings(StringBuilder sb) {
+    // FST traversal state is created lazily, on the first character that may start a key
+    FST.BytesReader bytesReader = null;
+    FST.Arc<CharsRef> firstArc = null;
+    FST.Arc<CharsRef> arc = null;
+
+    int longestMatch;
+    CharsRef longestOutput;
+
+    for (int i = 0; i < sb.length(); i++) {
+      if (!mightReplaceChar(sb.charAt(i))) {
+        continue;
+      }
+
+      if (firstArc == null) {
+        firstArc = fst.getFirstArc(new FST.Arc<>());
+        bytesReader = fst.getBytesReader();
+        arc = new FST.Arc<>();
+      }
+      arc.copyFrom(firstArc);
+      CharsRef output = fst.outputs.getNoOutput();
+      longestMatch = -1;
+      longestOutput = null;
+
+      // follow the FST from position i, remembering the longest complete key seen so far
+      for (int j = i; j < sb.length(); j++) {
+        char ch = sb.charAt(j);
+
+        try {
+          if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+            break;
+          }
+          output = fst.outputs.add(output, arc.output());
+        } catch (IOException bogus) {
+          throw new RuntimeException(bogus);
+        }
+        if (arc.isFinal()) {
+          longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
+          longestMatch = j;
+        }
+      }
+
+      if (longestMatch >= 0) {
+        sb.delete(i, longestMatch + 1);
+        sb.insert(i, longestOutput);
+        // resume right after the inserted output (the loop's i++ supplies the final +1)
+        i += (longestOutput.length - 1);
+      }
+    }
+  }
+
+  /**
+   * Returns {@code false} only if no key starts with {@code c}. A {@code true} result may be a
+   * false positive, because characters are bucketed modulo the bit set size.
+   */
+  boolean mightReplaceChar(char c) {
+    return firstCharHashes.get(c % mod);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index b65d287621d..d22c6eb8832 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
-import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 
 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
@@ -172,13 +169,7 @@ public class Dictionary {
   int maxNGramSuggestions = Integer.MAX_VALUE;
   boolean onlyMaxDiff;
   char noSuggest, subStandard;
-
-  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
-  FST<CharsRef> iconv;
-  FST<CharsRef> oconv;
-
-  boolean needsInputCleaning;
-  boolean needsOutputCleaning;
+  ConvTable iconv, oconv;
 
   // true if we can strip suffixes "down to nothing"
   boolean fullStrip;
@@ -224,8 +215,6 @@ public class Dictionary {
       boolean ignoreCase)
       throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
-    this.needsInputCleaning = ignoreCase;
-    this.needsOutputCleaning = false; // set if we have an OCONV
 
     try (BufferedInputStream affixStream =
             new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
@@ -379,16 +368,13 @@ public class Dictionary {
       } else if ("IGNORE".equals(firstWord)) {
         ignore = singleArgument(reader, line).toCharArray();
         Arrays.sort(ignore);
-        needsInputCleaning = true;
       } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
         int num = parseNum(reader, line);
-        FST<CharsRef> res = parseConversions(reader, num);
+        ConvTable res = parseConversions(reader, num);
         if (line.startsWith("I")) {
           iconv = res;
-          needsInputCleaning |= iconv != null;
         } else {
           oconv = res;
-          needsOutputCleaning |= oconv != null;
         }
       } else if ("FULLSTRIP".equals(firstWord)) {
         fullStrip = true;
@@ -803,9 +789,8 @@ public class Dictionary {
       affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
       affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
 
-      if (needsInputCleaning) {
-        CharSequence cleaned = cleanInput(affixArg, sb);
-        affixArg = cleaned.toString();
+      if (needsInputCleaning(affixArg)) {
+        affixArg = cleanInput(affixArg, sb).toString();
       }
 
       if (isSuffix) {
@@ -840,9 +825,9 @@ public class Dictionary {
     return affixData(affix, AFFIX_CONDITION) >>> 1;
   }
 
-  private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
+  private ConvTable parseConversions(LineNumberReader reader, int num)
       throws IOException, ParseException {
-    Map<String, String> mappings = new TreeMap<>();
+    TreeMap<String, String> mappings = new TreeMap<>();
 
     for (int i = 0; i < num; i++) {
       String[] parts = splitBySpace(reader, reader.readLine(), 3);
@@ -851,15 +836,7 @@ public class Dictionary {
       }
     }
 
-    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-    for (Map.Entry<String, String> entry : mappings.entrySet()) {
-      Util.toUTF16(entry.getKey(), scratchInts);
-      fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
-    }
-
-    return fstCompiler.compile();
+    return new ConvTable(mappings);
   }
 
   private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
@@ -1085,8 +1062,9 @@ public class Dictionary {
           int sep = flagSep < 0 ? morphSep : flagSep;
 
           CharSequence toWrite;
-          if (needsInputCleaning) {
-            cleanInput(line, sep, reuse);
+          String beforeSep = line.substring(0, sep);
+          if (needsInputCleaning(beforeSep)) {
+            cleanInput(beforeSep, reuse);
             reuse.append(line, sep, line.length());
             toWrite = reuse;
           } else {
@@ -1571,14 +1549,28 @@ public class Dictionary {
     return flagLookup.hasFlag(entryId, flag);
   }
 
-  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
-    return cleanInput(input, input.length(), reuse);
+  boolean mayNeedInputCleaning() {
+    return ignoreCase || ignore != null || iconv != null;
   }
 
-  private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
+  boolean needsInputCleaning(CharSequence input) {
+    if (mayNeedInputCleaning()) {
+      for (int i = 0; i < input.length(); i++) {
+        char ch = input.charAt(i);
+        if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0
+            || ignoreCase && caseFold(ch) != ch
+            || iconv != null && iconv.mightReplaceChar(ch)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
     reuse.setLength(0);
 
-    for (int i = 0; i < prefixLength; i++) {
+    for (int i = 0; i < input.length(); i++) {
       char ch = input.charAt(i);
 
       if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@@ -1594,11 +1586,7 @@ public class Dictionary {
     }
 
     if (iconv != null) {
-      try {
-        applyMappings(iconv, reuse);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
+      iconv.applyMappings(reuse);
       if (ignoreCase) {
         for (int i = 0; i < reuse.length(); i++) {
           reuse.setCharAt(i, caseFold(reuse.charAt(i)));
@@ -1624,44 +1612,6 @@ public class Dictionary {
     }
   }
 
-  // TODO: this could be more efficient!
-  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
-    final FST.BytesReader bytesReader = fst.getBytesReader();
-    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
-    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
-
-    // temporary stuff
-    final FST.Arc<CharsRef> arc = new FST.Arc<>();
-    int longestMatch;
-    CharsRef longestOutput;
-
-    for (int i = 0; i < sb.length(); i++) {
-      arc.copyFrom(firstArc);
-      CharsRef output = NO_OUTPUT;
-      longestMatch = -1;
-      longestOutput = null;
-
-      for (int j = i; j < sb.length(); j++) {
-        char ch = sb.charAt(j);
-        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
-          break;
-        } else {
-          output = fst.outputs.add(output, arc.output());
-        }
-        if (arc.isFinal()) {
-          longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
-          longestMatch = j;
-        }
-      }
-
-      if (longestMatch >= 0) {
-        sb.delete(i, longestMatch + 1);
-        sb.insert(i, longestOutput);
-        i += (longestOutput.length - 1);
-      }
-    }
-  }
-
   /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
   public boolean getIgnoreCase() {
     return ignoreCase;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index e85494b57d9..082076a57c3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.LinkedHashSet;
@@ -72,7 +71,7 @@ public class Hunspell {
     checkCanceled.run();
     if (word.isEmpty()) return true;
 
-    if (dictionary.needsInputCleaning) {
+    if (dictionary.needsInputCleaning(word)) {
       word = dictionary.cleanInput(word, new StringBuilder()).toString();
     }
 
@@ -479,7 +478,7 @@ public class Hunspell {
     checkCanceled.run();
     if (word.length() >= 100) return Collections.emptyList();
 
-    if (dictionary.needsInputCleaning) {
+    if (dictionary.needsInputCleaning(word)) {
       word = dictionary.cleanInput(word, new StringBuilder()).toString();
     }
 
@@ -565,14 +564,10 @@ public class Hunspell {
   }
 
   private String cleanOutput(String s) {
-    if (!dictionary.needsOutputCleaning) return s;
+    if (dictionary.oconv == null) return s;
 
-    try {
-      StringBuilder sb = new StringBuilder(s);
-      Dictionary.applyMappings(dictionary.oconv, sb);
-      return sb.toString();
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
+    StringBuilder sb = new StringBuilder(s);
+    dictionary.oconv.applyMappings(sb);
+    return sb.toString();
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index ceb47b2bd98..012b7641fe7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -83,14 +83,16 @@ final class Stemmer {
    */
   public List<CharsRef> stem(char[] word, int length) {
 
-    if (dictionary.needsInputCleaning) {
+    if (dictionary.mayNeedInputCleaning()) {
       scratchSegment.setLength(0);
       scratchSegment.append(word, 0, length);
-      CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
-      scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
-      length = segment.length();
-      segment.getChars(0, length, scratchBuffer, 0);
-      word = scratchBuffer;
+      if (dictionary.needsInputCleaning(scratchSegment)) {
+        CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
+        scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
+        length = segment.length();
+        segment.getChars(0, length, scratchBuffer, 0);
+        word = scratchBuffer;
+      }
     }
 
     List<CharsRef> list = new ArrayList<>();
@@ -365,18 +367,14 @@ final class Stemmer {
   private CharsRef newStem(CharsRef stem, int morphDataId) {
     String exception = stemException(morphDataId);
 
-    if (dictionary.needsOutputCleaning) {
+    if (dictionary.oconv != null) {
       scratchSegment.setLength(0);
       if (exception != null) {
         scratchSegment.append(exception);
       } else {
         scratchSegment.append(stem.chars, stem.offset, stem.length);
       }
-      try {
-        Dictionary.applyMappings(dictionary.oconv, scratchSegment);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
+      dictionary.oconv.applyMappings(scratchSegment);
       char[] cleaned = new char[scratchSegment.length()];
       scratchSegment.getChars(0, cleaned.length, cleaned, 0);
       return new CharsRef(cleaned, 0, cleaned.length);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 2cc0c8495ac..1b64c3a9d31 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.fst.CharSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FSTCompiler;
-import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.fst.Util;
 import org.junit.Test;
 
 public class TestDictionary extends LuceneTestCase {
@@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
     assertTrue(dictStream.isClosed());
   }
 
-  public void testReplacements() throws Exception {
-    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-
-    // a -> b
-    Util.toUTF16("a", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("b"));
-
-    // ab -> c
-    Util.toUTF16("ab", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("c"));
-
-    // c -> de
-    Util.toUTF16("c", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("de"));
-
-    // def -> gh
-    Util.toUTF16("def", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
-
-    FST<CharsRef> fst = fstCompiler.compile();
+  public void testReplacements() {
+    TreeMap<String, String> map = new TreeMap<>();
+    map.put("a", "b");
+    map.put("ab", "c");
+    map.put("c", "de");
+    map.put("def", "gh");
+    ConvTable table = new ConvTable(map);
 
     StringBuilder sb = new StringBuilder("atestanother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("btestbnother", sb.toString());
 
     sb = new StringBuilder("abtestanother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("ctestbnother", sb.toString());
 
     sb = new StringBuilder("atestabnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("btestcnother", sb.toString());
 
     sb = new StringBuilder("abtestabnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("ctestcnother", sb.toString());
 
     sb = new StringBuilder("abtestabcnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("ctestcdenother", sb.toString());
 
     sb = new StringBuilder("defdefdefc");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
     assertEquals("ghghghde", sb.toString());
   }
 