hunspell: a couple of micro-optimizations to speed up dictionary loading (#12825)

* hunspell: a couple of micro-optimizations to speed up dictionary loading
  1. Sort by the whole entry without searching for separators first: WordStorage doesn't require strict lexicographic order (only something close to it), and the separators sort before any usual word characters anyway.
  2. Avoid stream overhead when adding an entry.
2023-11-21 08:05:42 +01:00 · 2023-11-21 08:05:42 +01:00 · c23b3b3301
parent 7eb8f6ee00
commit c23b3b3301
3 changed files with 10 additions and 39 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -88,6 +88,8 @@ Optimizations

 * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)

+* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
+
 * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)

 * GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -56,6 +56,7 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefComparator;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
@ -1087,42 +1088,7 @@ public class Dictionary {

  private String sortWordsOffline(
      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
-    OfflineSorter sorter =
-        new OfflineSorter(
-            tempDir,
-            tempFileNamePrefix,
-            new Comparator<>() {
-              final BytesRef scratch1 = new BytesRef();
-              final BytesRef scratch2 = new BytesRef();
-
-              private void initScratch(BytesRef o, BytesRef scratch) {
-                scratch.bytes = o.bytes;
-                scratch.offset = o.offset;
-                scratch.length = o.length;
-
-                for (int i = scratch.length - 1; i >= 0; i--) {
-                  if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
-                      || scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
-                    scratch.length = i;
-                    break;
-                  }
-                }
-              }
-
-              @Override
-              public int compare(BytesRef o1, BytesRef o2) {
-                initScratch(o1, scratch1);
-                initScratch(o2, scratch2);
-
-                int cmp = scratch1.compareTo(scratch2);
-                if (cmp == 0) {
-                  // tie break on whole row
-                  return o1.compareTo(o2);
-                } else {
-                  return cmp;
-                }
-              }
-            });
+    var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);

    String sorted;
    boolean success = false;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@ -350,16 +350,19 @@ abstract class WordStorage {

      currentOrds.clear();
      boolean hasNonHidden = false;
+      boolean isSuggestible = false;
      for (char[] flags : group) {
        if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
          hasNonHidden = true;
-          break;
+        }
+        if (!hasNoSuggestFlag(flags)) {
+          isSuggestible = true;
        }
      }

      for (int i = 0; i < group.size(); i++) {
        char[] flags = group.get(i);
-        if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
+        if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
          continue;
        }

@ -388,7 +391,7 @@ abstract class WordStorage {

      int mask =
          (prevCode == 0 ? 0 : COLLISION_MASK)
-              | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
+              | (isSuggestible ? SUGGESTIBLE_MASK : 0)
              | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
      hashTable[hash] = (mask << OFFSET_BITS) | pos;