From 58e3b7a8542fa8d94a77215fdf13d575a5657546 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Fri, 19 Feb 2021 20:10:06 +0100
Subject: [PATCH] LUCENE-9790: Hunspell: avoid slow dictionary lookup if the
 word's hash isn't there (#2405)

---
 .../lucene/analysis/hunspell/Dictionary.java  | 39 ++++++++++++-------
 .../hunspell/TestAllDictionaries.java         |  3 +-
 .../apache/lucene/analysis/CharArrayMap.java  | 12 +++---
 .../java/org/apache/lucene/util/CharsRef.java | 13 +++++--
 4 files changed, 43 insertions(+), 24 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 7b1fdf96589..5a23d76b218 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -51,6 +51,8 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -98,6 +100,9 @@ public class Dictionary {
    */
   FST<IntsRef> words;
 
+  /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
+  FixedBitSet wordHashes;
+
   /**
    * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish
    * this is 756), otherwise humans wouldn't be able to deal with it either.
@@ -249,7 +254,9 @@ public class Dictionary {
       readAffixFile(affixStream, decoder, flagEnumerator);
 
       // read dictionary entries
-      IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
+      IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
+      int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
+      wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10));
       String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
       words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
       flagLookup = flagEnumerator.finish();
@@ -264,6 +271,11 @@ public class Dictionary {
 
   /** Looks up Hunspell word forms from the dictionary */
   IntsRef lookupWord(char[] word, int offset, int length) {
+    // Bloom-filter pre-check; an empty filter (no words loaded) must not reach the modulo
+    int bits = wordHashes.length();
+    int hash = CharsRef.stringHashCode(word, offset, length);
+    if (bits == 0 || !wordHashes.get(Math.abs(hash) % bits)) return null;
+
     return lookup(words, word, offset, length);
   }
 
@@ -1015,15 +1027,12 @@ public class Dictionary {
     }
   }
 
-  private IndexOutput mergeDictionaries(
-      Directory tempDir,
-      String tempFileNamePrefix,
-      List<InputStream> dictionaries,
-      CharsetDecoder decoder)
+  private int mergeDictionaries(
+      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
       throws IOException {
     StringBuilder sb = new StringBuilder();
-    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
+    int wordCount = 0;
+    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
       for (InputStream dictionary : dictionaries) {
         BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
         lines.readLine(); // first line is number of entries (approximately, sometimes)
@@ -1045,16 +1054,17 @@ public class Dictionary {
             }
           }
 
-          writeNormalizedWordEntry(sb, writer, line);
+          wordCount += writeNormalizedWordEntry(sb, writer, line);
         }
       }
-      CodecUtil.writeFooter(unsorted);
+      CodecUtil.writeFooter(output);
     }
-    return unsorted;
+    return wordCount;
   }
 
-  private void writeNormalizedWordEntry(
-      StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
+  /** @return the number of word entries written */
+  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+      throws IOException {
     int flagSep = line.indexOf(FLAG_SEPARATOR);
     int morphSep = line.indexOf(MORPH_SEPARATOR);
     assert morphSep > 0;
@@ -1078,7 +1088,9 @@ public class Dictionary {
     WordCase wordCase = WordCase.caseOf(written, sep);
     if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
       addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+      return 2;
     }
+    return 1;
   }
 
   private void addHiddenCapitalizedWord(
@@ -1221,6 +1233,7 @@ public class Dictionary {
           }
         }
 
+        wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
         grouper.add(entry, wordForm, morphDataID);
       }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index acef45bb4e1..6fac33d2d9f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -160,7 +160,8 @@ public class TestAllDictionaries extends LuceneTestCase {
           try {
             Dictionary dic = loadDictionary(aff);
             totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
-            totalWords.addAndGet(RamUsageTester.sizeOf(dic.words));
+            totalWords.addAndGet(
+                RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
             System.out.println(aff + "\t" + memoryUsageSummary(dic));
           } catch (Throwable e) {
             failures.add(aff);
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
index ea94f96d0b0..eace7d0edb3 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
@@ -22,6 +22,7 @@ import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
+import org.apache.lucene.util.CharsRef;
 
 /**
  * A simple class that stores key Strings as char[]'s in a hash table. Note that this is not a
@@ -266,20 +267,17 @@ public class CharArrayMap<V> extends AbstractMap<Object, V> {
 
   private int getHashCode(char[] text, int offset, int len) {
     if (text == null) throw new NullPointerException();
-    int code = 0;
-    final int stop = offset + len;
     if (ignoreCase) {
+      int stop = offset + len;
+      int code = 0;
       for (int i = offset; i < stop; ) {
         final int codePointAt = Character.codePointAt(text, i, stop);
         code = code * 31 + Character.toLowerCase(codePointAt);
         i += Character.charCount(codePointAt);
       }
-    } else {
-      for (int i = offset; i < stop; i++) {
-        code = code * 31 + text[i];
-      }
+      return code; // hash over lower-cased code points
     }
-    return code;
+    return CharsRef.stringHashCode(text, offset, len); // case-sensitive path
   }
 
   private int getHashCode(CharSequence text) {
diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
index cf1f96969ea..395bd865330 100644
--- a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
+++ b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java
@@ -74,11 +74,18 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence, Clone
 
   @Override
   public int hashCode() {
-    final int prime = 31;
+    return stringHashCode(chars, offset, length);
+  }
+
+  /**
+   * Returns the hash code of the {@code length} chars of {@code chars} starting at {@code offset},
+   * computed as specified by {@link String#hashCode()}.
+   */
+  public static int stringHashCode(char[] chars, int offset, int length) {
+    int end = offset + length;
     int result = 0;
-    final int end = offset + length;
     for (int i = offset; i < end; i++) {
-      result = prime * result + chars[i];
+      result = 31 * result + chars[i];
     }
     return result;
   }