LUCENE-9778: Hunspell: speed up input conversion (#2376)

2021-02-17 09:10:40 +01:00 · 2021-02-17 09:10:40 +01:00 · 2ae45cc985
parent 2d53c6073b
commit 2ae45cc985
5 changed files with 169 additions and 137 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java
@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.TreeMap;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
+
+/** ICONV or OCONV replacement table: an FST of replacements plus a first-char prefilter. */
+class ConvTable {
+  private final FST<CharsRef> fst; // key (as UTF-16 code units) -> replacement text
+  private final FixedBitSet firstCharHashes; // bit (c % mod) is set for each char c starting a key
+  private final int mod; // number of bits in firstCharHashes
+
+  ConvTable(TreeMap<String, String> mappings) {
+    mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1); // never fewer than 256 bits
+    firstCharHashes = new FixedBitSet(mod);
+
+    try {
+      Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+      FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
+      IntsRefBuilder scratchInts = new IntsRefBuilder();
+      for (Map.Entry<String, String> entry : mappings.entrySet()) { // TreeMap: sorted key order
+        String key = entry.getKey();
+        assert key.length() > 0; // charAt(0) below needs a non-empty key
+        firstCharHashes.set(key.charAt(0) % mod); // different chars may share one bit
+        Util.toUTF16(key, scratchInts);
+        fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
+      }
+
+      fst = fstCompiler.compile();
+    } catch (IOException bogus) { // NOTE(review): presumably unreachable for in-memory FST - confirm
+      throw new RuntimeException(bogus);
+    }
+  }
+
+  void applyMappings(StringBuilder sb) { // rewrites sb in place, taking the longest key at each index
+    FST.BytesReader bytesReader = null;
+    FST.Arc<CharsRef> firstArc = null;
+    FST.Arc<CharsRef> arc = null;
+
+    int longestMatch;
+    CharsRef longestOutput;
+
+    for (int i = 0; i < sb.length(); i++) {
+      if (!mightReplaceChar(sb.charAt(i))) { // no key starts with this char: skip the FST walk
+        continue;
+      }
+
+      if (firstArc == null) { // traversal state is created lazily, at the first candidate position
+        firstArc = fst.getFirstArc(new FST.Arc<>());
+        bytesReader = fst.getBytesReader();
+        arc = new FST.Arc<>();
+      }
+      arc.copyFrom(firstArc); // restart from the first arc for every start position i
+      CharsRef output = fst.outputs.getNoOutput();
+      longestMatch = -1; // -1: no key has matched at position i so far
+      longestOutput = null;
+
+      for (int j = i; j < sb.length(); j++) {
+        char ch = sb.charAt(j);
+
+        try {
+          if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+            break; // no arc for ch from the current state: stop extending the match
+          }
+          output = fst.outputs.add(output, arc.output()); // accumulate output along the path
+        } catch (IOException bogus) {
+          throw new RuntimeException(bogus);
+        }
+        if (arc.isFinal()) { // chars i..j form a complete key; keep scanning for a longer one
+          longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
+          longestMatch = j;
+        }
+      }
+
+      if (longestMatch >= 0) {
+        sb.delete(i, longestMatch + 1);
+        sb.insert(i, longestOutput);
+        i += (longestOutput.length - 1); // with the loop's i++, resumes just past the inserted text
+      }
+    }
+  }
+
+  boolean mightReplaceChar(char c) { // false: no key starts with c; true: a key may start with c
+    return firstCharHashes.get(c % mod);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
-import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;

 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
@ -172,13 +169,7 @@ public class Dictionary {
  int maxNGramSuggestions = Integer.MAX_VALUE;
  boolean onlyMaxDiff;
  char noSuggest, subStandard;
-
-  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
-  FST<CharsRef> iconv;
-  FST<CharsRef> oconv;
-
-  boolean needsInputCleaning;
-  boolean needsOutputCleaning;
+  ConvTable iconv, oconv;

  // true if we can strip suffixes "down to nothing"
  boolean fullStrip;
@ -224,8 +215,6 @@ public class Dictionary {
      boolean ignoreCase)
      throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
-    this.needsInputCleaning = ignoreCase;
-    this.needsOutputCleaning = false; // set if we have an OCONV

    try (BufferedInputStream affixStream =
        new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
@ -379,16 +368,13 @@ public class Dictionary {
      } else if ("IGNORE".equals(firstWord)) {
        ignore = singleArgument(reader, line).toCharArray();
        Arrays.sort(ignore);
-        needsInputCleaning = true;
      } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
        int num = parseNum(reader, line);
-        FST<CharsRef> res = parseConversions(reader, num);
+        ConvTable res = parseConversions(reader, num);
        if (line.startsWith("I")) {
          iconv = res;
-          needsInputCleaning |= iconv != null;
        } else {
          oconv = res;
-          needsOutputCleaning |= oconv != null;
        }
      } else if ("FULLSTRIP".equals(firstWord)) {
        fullStrip = true;
@ -803,9 +789,8 @@ public class Dictionary {
      affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
      affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;

-      if (needsInputCleaning) {
-        CharSequence cleaned = cleanInput(affixArg, sb);
-        affixArg = cleaned.toString();
+      if (needsInputCleaning(affixArg)) {
+        affixArg = cleanInput(affixArg, sb).toString();
      }

      if (isSuffix) {
@ -840,9 +825,9 @@ public class Dictionary {
    return affixData(affix, AFFIX_CONDITION) >>> 1;
  }

-  private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
+  private ConvTable parseConversions(LineNumberReader reader, int num)
      throws IOException, ParseException {
-    Map<String, String> mappings = new TreeMap<>();
+    TreeMap<String, String> mappings = new TreeMap<>();

    for (int i = 0; i < num; i++) {
      String[] parts = splitBySpace(reader, reader.readLine(), 3);
@ -851,15 +836,7 @@ public class Dictionary {
      }
    }

-    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-    for (Map.Entry<String, String> entry : mappings.entrySet()) {
-      Util.toUTF16(entry.getKey(), scratchInts);
-      fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
-    }
-
-    return fstCompiler.compile();
+    return new ConvTable(mappings);
  }

  private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
@ -1085,8 +1062,9 @@ public class Dictionary {
    int sep = flagSep < 0 ? morphSep : flagSep;

    CharSequence toWrite;
-    if (needsInputCleaning) {
-      cleanInput(line, sep, reuse);
+    String beforeSep = line.substring(0, sep);
+    if (needsInputCleaning(beforeSep)) {
+      cleanInput(beforeSep, reuse);
      reuse.append(line, sep, line.length());
      toWrite = reuse;
    } else {
@ -1571,14 +1549,28 @@ public class Dictionary {
    return flagLookup.hasFlag(entryId, flag);
  }

-  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
-    return cleanInput(input, input.length(), reuse);
+  boolean mayNeedInputCleaning() { // input-independent: is any of the three options set at all?
+    return ignoreCase || ignore != null || iconv != null;
+  }

-  private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
+  boolean needsInputCleaning(CharSequence input) { // per-input check; false lets callers skip cleanInput
+    if (mayNeedInputCleaning()) {
+      for (int i = 0; i < input.length(); i++) {
+        char ch = input.charAt(i);
+        if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0 // ch is one of the IGNORE chars
+            || ignoreCase && caseFold(ch) != ch // case folding would change ch
+            || iconv != null && iconv.mightReplaceChar(ch)) { // ch may start an ICONV key
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
    reuse.setLength(0);

-    for (int i = 0; i < prefixLength; i++) {
+    for (int i = 0; i < input.length(); i++) {
      char ch = input.charAt(i);

      if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@ -1594,11 +1586,7 @@ public class Dictionary {
    }

    if (iconv != null) {
-      try {
-        applyMappings(iconv, reuse);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
+      iconv.applyMappings(reuse);
      if (ignoreCase) {
        for (int i = 0; i < reuse.length(); i++) {
          reuse.setCharAt(i, caseFold(reuse.charAt(i)));
@ -1624,44 +1612,6 @@ public class Dictionary {
    }
  }

-  // TODO: this could be more efficient!
-  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
-    final FST.BytesReader bytesReader = fst.getBytesReader();
-    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
-    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
-
-    // temporary stuff
-    final FST.Arc<CharsRef> arc = new FST.Arc<>();
-    int longestMatch;
-    CharsRef longestOutput;
-
-    for (int i = 0; i < sb.length(); i++) {
-      arc.copyFrom(firstArc);
-      CharsRef output = NO_OUTPUT;
-      longestMatch = -1;
-      longestOutput = null;
-
-      for (int j = i; j < sb.length(); j++) {
-        char ch = sb.charAt(j);
-        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
-          break;
-        } else {
-          output = fst.outputs.add(output, arc.output());
-        }
-        if (arc.isFinal()) {
-          longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
-          longestMatch = j;
-        }
-      }
-
-      if (longestMatch >= 0) {
-        sb.delete(i, longestMatch + 1);
-        sb.insert(i, longestOutput);
-        i += (longestOutput.length - 1);
-      }
-    }
-  }
-
  /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
  public boolean getIgnoreCase() {
    return ignoreCase;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.LinkedHashSet;
@ -72,7 +71,7 @@ public class Hunspell {
    checkCanceled.run();
    if (word.isEmpty()) return true;

-    if (dictionary.needsInputCleaning) {
+    if (dictionary.needsInputCleaning(word)) {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

@ -479,7 +478,7 @@ public class Hunspell {
    checkCanceled.run();
    if (word.length() >= 100) return Collections.emptyList();

-    if (dictionary.needsInputCleaning) {
+    if (dictionary.needsInputCleaning(word)) {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

@ -565,14 +564,10 @@ public class Hunspell {
  }

  private String cleanOutput(String s) {
-    if (!dictionary.needsOutputCleaning) return s;
+    if (dictionary.oconv == null) return s;

-    try {
    StringBuilder sb = new StringBuilder(s);
-      Dictionary.applyMappings(dictionary.oconv, sb);
+    dictionary.oconv.applyMappings(sb);
    return sb.toString();
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -83,15 +83,17 @@ final class Stemmer {
   */
  public List<CharsRef> stem(char[] word, int length) {

-    if (dictionary.needsInputCleaning) {
+    if (dictionary.mayNeedInputCleaning()) {
      scratchSegment.setLength(0);
      scratchSegment.append(word, 0, length);
+      if (dictionary.needsInputCleaning(scratchSegment)) {
        CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
        scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
        length = segment.length();
        segment.getChars(0, length, scratchBuffer, 0);
        word = scratchBuffer;
      }
+    }

    List<CharsRef> list = new ArrayList<>();
    RootProcessor processor =
@ -365,18 +367,14 @@ final class Stemmer {
  private CharsRef newStem(CharsRef stem, int morphDataId) {
    String exception = stemException(morphDataId);

-    if (dictionary.needsOutputCleaning) {
+    if (dictionary.oconv != null) {
      scratchSegment.setLength(0);
      if (exception != null) {
        scratchSegment.append(exception);
      } else {
        scratchSegment.append(stem.chars, stem.offset, stem.length);
      }
-      try {
-        Dictionary.applyMappings(dictionary.oconv, scratchSegment);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
+      dictionary.oconv.applyMappings(scratchSegment);
      char[] cleaned = new char[scratchSegment.length()];
      scratchSegment.getChars(0, cleaned.length, cleaned, 0);
      return new CharsRef(cleaned, 0, cleaned.length);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.fst.CharSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FSTCompiler;
-import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.fst.Util;
 import org.junit.Test;

 public class TestDictionary extends LuceneTestCase {
@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
    assertTrue(dictStream.isClosed());
  }

-  public void testReplacements() throws Exception {
-    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-
-    // a -> b
-    Util.toUTF16("a", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("b"));
-
-    // ab -> c
-    Util.toUTF16("ab", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("c"));
-
-    // c -> de
-    Util.toUTF16("c", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("de"));
-
-    // def -> gh
-    Util.toUTF16("def", scratchInts);
-    fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
-
-    FST<CharsRef> fst = fstCompiler.compile();
+  public void testReplacements() {
+    TreeMap<String, String> map = new TreeMap<>();
+    map.put("a", "b");
+    map.put("ab", "c");
+    map.put("c", "de");
+    map.put("def", "gh");
+    ConvTable table = new ConvTable(map);

    StringBuilder sb = new StringBuilder("atestanother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("btestbnother", sb.toString());

    sb = new StringBuilder("abtestanother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("ctestbnother", sb.toString());

    sb = new StringBuilder("atestabnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("btestcnother", sb.toString());

    sb = new StringBuilder("abtestabnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("ctestcnother", sb.toString());

    sb = new StringBuilder("abtestabcnother");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("ctestcdenother", sb.toString());

    sb = new StringBuilder("defdefdefc");
-    Dictionary.applyMappings(fst, sb);
+    table.applyMappings(sb);
    assertEquals("ghghghde", sb.toString());
  }