LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405)

This commit is contained in:
Peter Gromov 2021-02-19 20:10:06 +01:00 committed by GitHub
parent 4b3fb1e065
commit 58e3b7a854
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 43 additions and 24 deletions

View File

@ -51,6 +51,8 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@ -98,6 +100,9 @@ public class Dictionary {
*/
FST<IntsRef> words;
/** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
FixedBitSet wordHashes;
/**
* The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish
* this is 756), otherwise humans wouldn't be able to deal with it either.
@ -249,7 +254,9 @@ public class Dictionary {
readAffixFile(affixStream, decoder, flagEnumerator);
// read dictionary entries
IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10));
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
flagLookup = flagEnumerator.finish();
@ -264,6 +271,11 @@ public class Dictionary {
/**
 * Looks up Hunspell word forms from the dictionary.
 *
 * <p>Consults the {@code wordHashes} Bloom filter first: a clear bit proves the word was never
 * added, so the expensive FST traversal can be skipped entirely.
 */
IntsRef lookupWord(char[] word, int offset, int length) {
int bucket = Math.abs(CharsRef.stringHashCode(word, offset, length)) % wordHashes.length();
return wordHashes.get(bucket) ? lookup(words, word, offset, length) : null;
}
@ -1015,15 +1027,12 @@ public class Dictionary {
}
}
private IndexOutput mergeDictionaries(
Directory tempDir,
String tempFileNamePrefix,
List<InputStream> dictionaries,
CharsetDecoder decoder)
private int mergeDictionaries(
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
throws IOException {
StringBuilder sb = new StringBuilder();
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
int wordCount = 0;
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
lines.readLine(); // first line is number of entries (approximately, sometimes)
@ -1045,16 +1054,17 @@ public class Dictionary {
}
}
writeNormalizedWordEntry(sb, writer, line);
wordCount += writeNormalizedWordEntry(sb, writer, line);
}
}
CodecUtil.writeFooter(unsorted);
CodecUtil.writeFooter(output);
}
return unsorted;
return wordCount;
}
private void writeNormalizedWordEntry(
StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
/** @return the number of word entries written */
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
@ -1078,7 +1088,9 @@ public class Dictionary {
WordCase wordCase = WordCase.caseOf(written, sep);
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
return 2;
}
return 1;
}
private void addHiddenCapitalizedWord(
@ -1221,6 +1233,7 @@ public class Dictionary {
}
}
wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
grouper.add(entry, wordForm, morphDataID);
}

View File

@ -160,7 +160,8 @@ public class TestAllDictionaries extends LuceneTestCase {
try {
Dictionary dic = loadDictionary(aff);
totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
totalWords.addAndGet(RamUsageTester.sizeOf(dic.words));
totalWords.addAndGet(
RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
System.out.println(aff + "\t" + memoryUsageSummary(dic));
} catch (Throwable e) {
failures.add(aff);

View File

@ -22,6 +22,7 @@ import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.CharsRef;
/**
* A simple class that stores key Strings as char[]'s in a hash table. Note that this is not a
@ -266,20 +267,17 @@ public class CharArrayMap<V> extends AbstractMap<Object, V> {
/**
 * Computes the hash of a sub-range of {@code text}.
 *
 * <p>When {@code ignoreCase} is set, hashes lowercased code points (iterating by code point so
 * surrogate pairs are handled correctly); otherwise delegates to the shared
 * {@link CharsRef#stringHashCode} helper, which matches the {@link String#hashCode()} formula.
 *
 * NOTE(review): SOURCE is a diff with interleaved removed/added lines; this is the reconstructed
 * post-commit version (duplicate declarations and the dangling else removed).
 *
 * @throws NullPointerException if {@code text} is null
 */
private int getHashCode(char[] text, int offset, int len) {
if (text == null) throw new NullPointerException();
if (ignoreCase) {
  int stop = offset + len;
  int code = 0;
  for (int i = offset; i < stop; ) {
    final int codePointAt = Character.codePointAt(text, i, stop);
    code = code * 31 + Character.toLowerCase(codePointAt);
    i += Character.charCount(codePointAt); // advance past surrogate pairs, not single chars
  }
  return code;
}
return CharsRef.stringHashCode(text, offset, len);
}
private int getHashCode(CharSequence text) {

View File

@ -74,11 +74,18 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence, Clone
@Override
public int hashCode() {
  // Delegates to the static helper so other code (e.g. Hunspell's Bloom filter) can compute
  // the identical hash without constructing a CharsRef. The leftover `prime` local from the
  // pre-refactoring version is removed — it was unused after the delegation was introduced.
  return stringHashCode(chars, offset, length);
}
/**
 * Computes the hash code of the given char sub-array using the {@link String#hashCode()}
 * specification: {@code s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]}, and 0 for an empty range.
 *
 * @param chars the backing array (not null)
 * @param offset index of the first char to include
 * @param length number of chars to include
 * @return the hash of {@code chars[offset..offset+length)}
 */
public static int stringHashCode(char[] chars, int offset, int length) {
  int result = 0;
  final int end = offset + length;
  for (int i = offset; i < end; i++) {
    result = 31 * result + chars[i];
  }
  return result;
}