LUCENE-9778: Hunspell: speed up input conversion (#2376)

This commit is contained in:
Peter Gromov 2021-02-17 09:10:40 +01:00 committed by GitHub
parent 2d53c6073b
commit 2ae45cc985
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 169 additions and 137 deletions

View File

@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
/** Replacement table backing the ICONV/OCONV directives of a Hunspell affix file. */
class ConvTable {
  /** Longest-match automaton mapping source character sequences to their replacement text. */
  private final FST<CharsRef> fst;

  /** Bloom-filter-style bit set of {@code firstChar % mod} for every mapping key. */
  private final FixedBitSet firstCharHashes;

  /** Size of the first-character filter: at least 256, roughly 2x the mapping count. */
  private final int mod;

  /**
   * Builds the table from sorted mappings.
   *
   * @param mappings conversion pairs; a sorted map is required because the FST compiler must
   *     receive its inputs in order. Keys must be non-empty.
   */
  ConvTable(TreeMap<String, String> mappings) {
    // Size the filter so it stays sparse even for large tables.
    mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1);
    firstCharHashes = new FixedBitSet(mod);
    try {
      Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
      FSTCompiler<CharsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
      IntsRefBuilder scratch = new IntsRefBuilder();
      for (Map.Entry<String, String> mapping : mappings.entrySet()) {
        String source = mapping.getKey();
        assert source.length() > 0;
        // Record which first characters can possibly start a replacement, for the fast path.
        firstCharHashes.set(source.charAt(0) % mod);
        Util.toUTF16(source, scratch);
        compiler.add(scratch.get(), new CharsRef(mapping.getValue()));
      }
      fst = compiler.compile();
    } catch (IOException bogus) {
      // The FST is built entirely in memory, so an IOException should be impossible here.
      throw new RuntimeException(bogus);
    }
  }

  /**
   * Destructively rewrites {@code text}, replacing each leftmost-longest occurrence of a mapping
   * key with its replacement. Scanning resumes after the inserted replacement, so replacement
   * output is never itself re-matched.
   */
  void applyMappings(StringBuilder text) {
    FST.BytesReader reader = null;
    FST.Arc<CharsRef> rootArc = null;
    FST.Arc<CharsRef> scratchArc = null;

    for (int start = 0; start < text.length(); start++) {
      // Cheap filter: most characters cannot begin any mapping key, so skip them outright.
      if (!mightReplaceChar(text.charAt(start))) {
        continue;
      }

      if (rootArc == null) {
        // Allocate the traversal state lazily, only once a candidate character is seen.
        rootArc = fst.getFirstArc(new FST.Arc<>());
        reader = fst.getBytesReader();
        scratchArc = new FST.Arc<>();
      }

      scratchArc.copyFrom(rootArc);
      CharsRef accumulated = fst.outputs.getNoOutput();
      int matchEnd = -1;
      CharsRef replacement = null;

      // Walk the FST as far as the input allows, remembering the longest accepted prefix.
      for (int pos = start; pos < text.length(); pos++) {
        char c = text.charAt(pos);
        try {
          if (fst.findTargetArc(c, scratchArc, scratchArc, reader) == null) {
            break;
          }
          accumulated = fst.outputs.add(accumulated, scratchArc.output());
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
        if (scratchArc.isFinal()) {
          replacement = fst.outputs.add(accumulated, scratchArc.nextFinalOutput());
          matchEnd = pos;
        }
      }

      if (matchEnd >= 0) {
        // Splice the replacement over the matched span and continue scanning just after it.
        text.delete(start, matchEnd + 1);
        text.insert(start, replacement);
        start += (replacement.length - 1);
      }
    }
  }

  /**
   * Returns true if some mapping key might start with {@code c}. False positives are possible
   * (it is a hash-based filter); false negatives are not.
   */
  boolean mightReplaceChar(char c) {
    return firstCharHashes.get(c % mod);
  }
}

View File

@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */ /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
@ -172,13 +169,7 @@ public class Dictionary {
int maxNGramSuggestions = Integer.MAX_VALUE; int maxNGramSuggestions = Integer.MAX_VALUE;
boolean onlyMaxDiff; boolean onlyMaxDiff;
char noSuggest, subStandard; char noSuggest, subStandard;
ConvTable iconv, oconv;
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
FST<CharsRef> oconv;
boolean needsInputCleaning;
boolean needsOutputCleaning;
// true if we can strip suffixes "down to nothing" // true if we can strip suffixes "down to nothing"
boolean fullStrip; boolean fullStrip;
@ -224,8 +215,6 @@ public class Dictionary {
boolean ignoreCase) boolean ignoreCase)
throws IOException, ParseException { throws IOException, ParseException {
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV
try (BufferedInputStream affixStream = try (BufferedInputStream affixStream =
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) { new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
@ -379,16 +368,13 @@ public class Dictionary {
} else if ("IGNORE".equals(firstWord)) { } else if ("IGNORE".equals(firstWord)) {
ignore = singleArgument(reader, line).toCharArray(); ignore = singleArgument(reader, line).toCharArray();
Arrays.sort(ignore); Arrays.sort(ignore);
needsInputCleaning = true;
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) { } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
int num = parseNum(reader, line); int num = parseNum(reader, line);
FST<CharsRef> res = parseConversions(reader, num); ConvTable res = parseConversions(reader, num);
if (line.startsWith("I")) { if (line.startsWith("I")) {
iconv = res; iconv = res;
needsInputCleaning |= iconv != null;
} else { } else {
oconv = res; oconv = res;
needsOutputCleaning |= oconv != null;
} }
} else if ("FULLSTRIP".equals(firstWord)) { } else if ("FULLSTRIP".equals(firstWord)) {
fullStrip = true; fullStrip = true;
@ -803,9 +789,8 @@ public class Dictionary {
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd; affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd; affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
if (needsInputCleaning) { if (needsInputCleaning(affixArg)) {
CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleanInput(affixArg, sb).toString();
affixArg = cleaned.toString();
} }
if (isSuffix) { if (isSuffix) {
@ -840,9 +825,9 @@ public class Dictionary {
return affixData(affix, AFFIX_CONDITION) >>> 1; return affixData(affix, AFFIX_CONDITION) >>> 1;
} }
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) private ConvTable parseConversions(LineNumberReader reader, int num)
throws IOException, ParseException { throws IOException, ParseException {
Map<String, String> mappings = new TreeMap<>(); TreeMap<String, String> mappings = new TreeMap<>();
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
String[] parts = splitBySpace(reader, reader.readLine(), 3); String[] parts = splitBySpace(reader, reader.readLine(), 3);
@ -851,15 +836,7 @@ public class Dictionary {
} }
} }
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); return new ConvTable(mappings);
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, String> entry : mappings.entrySet()) {
Util.toUTF16(entry.getKey(), scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
}
return fstCompiler.compile();
} }
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf}; private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
@ -1085,8 +1062,9 @@ public class Dictionary {
int sep = flagSep < 0 ? morphSep : flagSep; int sep = flagSep < 0 ? morphSep : flagSep;
CharSequence toWrite; CharSequence toWrite;
if (needsInputCleaning) { String beforeSep = line.substring(0, sep);
cleanInput(line, sep, reuse); if (needsInputCleaning(beforeSep)) {
cleanInput(beforeSep, reuse);
reuse.append(line, sep, line.length()); reuse.append(line, sep, line.length());
toWrite = reuse; toWrite = reuse;
} else { } else {
@ -1571,14 +1549,28 @@ public class Dictionary {
return flagLookup.hasFlag(entryId, flag); return flagLookup.hasFlag(entryId, flag);
} }
CharSequence cleanInput(CharSequence input, StringBuilder reuse) { boolean mayNeedInputCleaning() {
return cleanInput(input, input.length(), reuse); return ignoreCase || ignore != null || iconv != null;
} }
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) { boolean needsInputCleaning(CharSequence input) {
if (mayNeedInputCleaning()) {
for (int i = 0; i < input.length(); i++) {
char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0
|| ignoreCase && caseFold(ch) != ch
|| iconv != null && iconv.mightReplaceChar(ch)) {
return true;
}
}
}
return false;
}
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
reuse.setLength(0); reuse.setLength(0);
for (int i = 0; i < prefixLength; i++) { for (int i = 0; i < input.length(); i++) {
char ch = input.charAt(i); char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) { if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@ -1594,11 +1586,7 @@ public class Dictionary {
} }
if (iconv != null) { if (iconv != null) {
try { iconv.applyMappings(reuse);
applyMappings(iconv, reuse);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (ignoreCase) { if (ignoreCase) {
for (int i = 0; i < reuse.length(); i++) { for (int i = 0; i < reuse.length(); i++) {
reuse.setCharAt(i, caseFold(reuse.charAt(i))); reuse.setCharAt(i, caseFold(reuse.charAt(i)));
@ -1624,44 +1612,6 @@ public class Dictionary {
} }
} }
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
// temporary stuff
final FST.Arc<CharsRef> arc = new FST.Arc<>();
int longestMatch;
CharsRef longestOutput;
for (int i = 0; i < sb.length(); i++) {
arc.copyFrom(firstArc);
CharsRef output = NO_OUTPUT;
longestMatch = -1;
longestOutput = null;
for (int j = i; j < sb.length(); j++) {
char ch = sb.charAt(j);
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
break;
} else {
output = fst.outputs.add(output, arc.output());
}
if (arc.isFinal()) {
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
longestMatch = j;
}
}
if (longestMatch >= 0) {
sb.delete(i, longestMatch + 1);
sb.insert(i, longestOutput);
i += (longestOutput.length - 1);
}
}
}
/** Returns true if this dictionary was constructed with the {@code ignoreCase} option */ /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
public boolean getIgnoreCase() { public boolean getIgnoreCase() {
return ignoreCase; return ignoreCase;

View File

@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
@ -72,7 +71,7 @@ public class Hunspell {
checkCanceled.run(); checkCanceled.run();
if (word.isEmpty()) return true; if (word.isEmpty()) return true;
if (dictionary.needsInputCleaning) { if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString(); word = dictionary.cleanInput(word, new StringBuilder()).toString();
} }
@ -479,7 +478,7 @@ public class Hunspell {
checkCanceled.run(); checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList(); if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning) { if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString(); word = dictionary.cleanInput(word, new StringBuilder()).toString();
} }
@ -565,14 +564,10 @@ public class Hunspell {
} }
private String cleanOutput(String s) { private String cleanOutput(String s) {
if (!dictionary.needsOutputCleaning) return s; if (dictionary.oconv == null) return s;
try { StringBuilder sb = new StringBuilder(s);
StringBuilder sb = new StringBuilder(s); dictionary.oconv.applyMappings(sb);
Dictionary.applyMappings(dictionary.oconv, sb); return sb.toString();
return sb.toString();
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
} }
} }

View File

@ -83,14 +83,16 @@ final class Stemmer {
*/ */
public List<CharsRef> stem(char[] word, int length) { public List<CharsRef> stem(char[] word, int length) {
if (dictionary.needsInputCleaning) { if (dictionary.mayNeedInputCleaning()) {
scratchSegment.setLength(0); scratchSegment.setLength(0);
scratchSegment.append(word, 0, length); scratchSegment.append(word, 0, length);
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment); if (dictionary.needsInputCleaning(scratchSegment)) {
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length()); CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
length = segment.length(); scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
segment.getChars(0, length, scratchBuffer, 0); length = segment.length();
word = scratchBuffer; segment.getChars(0, length, scratchBuffer, 0);
word = scratchBuffer;
}
} }
List<CharsRef> list = new ArrayList<>(); List<CharsRef> list = new ArrayList<>();
@ -365,18 +367,14 @@ final class Stemmer {
private CharsRef newStem(CharsRef stem, int morphDataId) { private CharsRef newStem(CharsRef stem, int morphDataId) {
String exception = stemException(morphDataId); String exception = stemException(morphDataId);
if (dictionary.needsOutputCleaning) { if (dictionary.oconv != null) {
scratchSegment.setLength(0); scratchSegment.setLength(0);
if (exception != null) { if (exception != null) {
scratchSegment.append(exception); scratchSegment.append(exception);
} else { } else {
scratchSegment.append(stem.chars, stem.offset, stem.length); scratchSegment.append(stem.chars, stem.offset, stem.length);
} }
try { dictionary.oconv.applyMappings(scratchSegment);
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
char[] cleaned = new char[scratchSegment.length()]; char[] cleaned = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0); scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length); return new CharsRef(cleaned, 0, cleaned.length);

View File

@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
import java.text.ParseException; import java.text.ParseException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.TreeMap;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.junit.Test; import org.junit.Test;
public class TestDictionary extends LuceneTestCase { public class TestDictionary extends LuceneTestCase {
@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
assertTrue(dictStream.isClosed()); assertTrue(dictStream.isClosed());
} }
public void testReplacements() throws Exception { public void testReplacements() {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); TreeMap<String, String> map = new TreeMap<>();
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); map.put("a", "b");
IntsRefBuilder scratchInts = new IntsRefBuilder(); map.put("ab", "c");
map.put("c", "de");
// a -> b map.put("def", "gh");
Util.toUTF16("a", scratchInts); ConvTable table = new ConvTable(map);
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
// ab -> c
Util.toUTF16("ab", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
// c -> de
Util.toUTF16("c", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
// def -> gh
Util.toUTF16("def", scratchInts);
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
FST<CharsRef> fst = fstCompiler.compile();
StringBuilder sb = new StringBuilder("atestanother"); StringBuilder sb = new StringBuilder("atestanother");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("btestbnother", sb.toString()); assertEquals("btestbnother", sb.toString());
sb = new StringBuilder("abtestanother"); sb = new StringBuilder("abtestanother");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("ctestbnother", sb.toString()); assertEquals("ctestbnother", sb.toString());
sb = new StringBuilder("atestabnother"); sb = new StringBuilder("atestabnother");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("btestcnother", sb.toString()); assertEquals("btestcnother", sb.toString());
sb = new StringBuilder("abtestabnother"); sb = new StringBuilder("abtestabnother");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("ctestcnother", sb.toString()); assertEquals("ctestcnother", sb.toString());
sb = new StringBuilder("abtestabcnother"); sb = new StringBuilder("abtestabcnother");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("ctestcdenother", sb.toString()); assertEquals("ctestcdenother", sb.toString());
sb = new StringBuilder("defdefdefc"); sb = new StringBuilder("defdefdefc");
Dictionary.applyMappings(fst, sb); table.applyMappings(sb);
assertEquals("ghghghde", sb.toString()); assertEquals("ghghghde", sb.toString());
} }