hunspell: allow in-memory entry sorting for faster dictionary loading (#12834)

* hunspell: allow in-memory entry sorting for faster dictionary loading

Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
Peter Gromov 2023-11-24 08:21:43 +01:00 committed by GitHub
parent 981339be04
commit f460d612b5
5 changed files with 297 additions and 145 deletions

lucene/CHANGES.txt

@@ -88,7 +88,8 @@ Optimizations
 * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
-* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
+* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
+  (Peter Gromov)
 * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

@@ -50,19 +50,12 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefComparator;
-import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.OfflineSorter;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -216,6 +209,25 @@ public class Dictionary {
       List<InputStream> dictionaries,
       boolean ignoreCase)
       throws IOException, ParseException {
+    this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
+  }
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to
+   * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
+   * @param sortingStrategy the strategy used to sort the dictionary entries during loading
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(
+      InputStream affix,
+      List<InputStream> dictionaries,
+      boolean ignoreCase,
+      SortingStrategy sortingStrategy)
+      throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
     try (BufferedInputStream affixStream =
@@ -251,10 +263,11 @@
     readAffixFile(affixStream, decoder, flagEnumerator);

     // read dictionary entries
-    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
-    int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
-    String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
-    words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
+    EntryAccumulator acc = sortingStrategy.start();
+    mergeDictionaries(dictionaries, decoder, acc);
+    try (EntrySupplier sorted = acc.finishAndSort()) {
+      words = readSortedDictionaries(flagEnumerator, sorted);
+    }
     flagLookup = flagEnumerator.finish();
     aliases = null; // no longer needed
     morphAliases = null; // no longer needed
@@ -986,52 +999,43 @@
     }
   }

-  private int mergeDictionaries(
-      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
+  private void mergeDictionaries(
+      List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
       throws IOException {
     StringBuilder sb = new StringBuilder();
-    int wordCount = 0;
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
-      for (InputStream dictionary : dictionaries) {
-        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
-        lines.readLine(); // first line is number of entries (approximately, sometimes)
-
-        String line;
-        while ((line = lines.readLine()) != null) {
-          // wild and unpredictable code comment rules
-          if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
-            continue;
-          }
-          line = unescapeEntry(line);
-          // if we haven't seen any custom morphological data, try to parse one
-          if (!hasCustomMorphData) {
-            int morphStart = line.indexOf(MORPH_SEPARATOR);
-            if (morphStart >= 0) {
-              String data = line.substring(morphStart + 1);
-              hasCustomMorphData =
-                  splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
-            }
-          }
-          wordCount += writeNormalizedWordEntry(sb, writer, line);
-        }
-      }
-      CodecUtil.writeFooter(output);
-    }
-    return wordCount;
+    for (InputStream dictionary : dictionaries) {
+      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+      lines.readLine(); // first line is number of entries (approximately, sometimes)
+
+      String line;
+      while ((line = lines.readLine()) != null) {
+        // wild and unpredictable code comment rules
+        if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
+          continue;
+        }
+        line = unescapeEntry(line);
+        // if we haven't seen any custom morphological data, try to parse one
+        if (!hasCustomMorphData) {
+          int morphStart = line.indexOf(MORPH_SEPARATOR);
+          if (morphStart >= 0) {
+            String data = line.substring(morphStart + 1);
+            hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
+          }
+        }
+        writeNormalizedWordEntry(sb, line, acc);
+      }
+    }
   }

-  /**
-   * @return the number of word entries written
-   */
-  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+  private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
       throws IOException {
     int flagSep = line.indexOf(FLAG_SEPARATOR);
     int morphSep = line.indexOf(MORPH_SEPARATOR);
     assert morphSep > 0;
     assert morphSep > flagSep;
     int sep = flagSep < 0 ? morphSep : flagSep;
-    if (sep == 0) return 0;
+    if (sep == 0) return;

     CharSequence toWrite;
     String beforeSep = line.substring(0, sep);
@@ -1045,19 +1049,16 @@

     String written = toWrite.toString();
     sep = written.length() - (line.length() - sep);
-    writer.write(written.getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(written);

     WordCase wordCase = WordCase.caseOf(written, sep);
     if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
-      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
-      return 2;
+      addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
     }
-    return 1;
   }

   private void addHiddenCapitalizedWord(
-      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
-      throws IOException {
+      StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
     reuse.setLength(0);
     reuse.append(Character.toUpperCase(word.charAt(0)));
     for (int i = 1; i < word.length(); i++) {
@@ -1066,7 +1067,7 @@
     reuse.append(FLAG_SEPARATOR);
     reuse.append(HIDDEN_FLAG);
     reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
-    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(reuse.toString());
   }

   String toLowerCase(String word) {
@@ -1086,102 +1087,66 @@
     return new String(chars);
   }

-  private String sortWordsOffline(
-      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
-    var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
-    String sorted;
-    boolean success = false;
-    try {
-      sorted = sorter.sort(unsorted.getName());
-      success = true;
-    } finally {
-      if (success) {
-        tempDir.deleteFile(unsorted.getName());
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
-      }
-    }
-    return sorted;
-  }
-
-  private WordStorage readSortedDictionaries(
-      Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
-    boolean success = false;
-
+  private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
+      throws IOException {
     Map<String, Integer> morphIndices = new HashMap<>();

     WordStorage.Builder builder =
         new WordStorage.Builder(
-            wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
+            sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());

-    try (ByteSequencesReader reader =
-        new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
-      // TODO: the flags themselves can be double-chars (long) or also numeric
-      // either way the trick is to encode them as char... but they must be parsed differently
-
-      while (true) {
-        BytesRef scratch = reader.next();
-        if (scratch == null) {
-          break;
-        }
-
-        String entry;
-        char[] wordForm;
-        int end;
-
-        String line = scratch.utf8ToString();
-        int flagSep = line.indexOf(FLAG_SEPARATOR);
-        if (flagSep == -1) {
-          wordForm = NOFLAGS;
-          end = line.indexOf(MORPH_SEPARATOR);
-          entry = line.substring(0, end);
-        } else {
-          end = line.indexOf(MORPH_SEPARATOR);
-          boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
-          String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
-          if (aliasCount > 0 && !flagPart.isEmpty()) {
-            flagPart = getAliasValue(Integer.parseInt(flagPart));
-          }
-          wordForm = flagParsingStrategy.parseFlags(flagPart);
-          if (hidden) {
-            wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
-            wordForm[wordForm.length - 1] = HIDDEN_FLAG;
-          }
-          entry = line.substring(0, flagSep);
-        }
-        if (entry.isEmpty()) continue;
-
-        int morphDataID = 0;
-        if (end + 1 < line.length()) {
-          List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
-          if (!morphFields.isEmpty()) {
-            morphFields.sort(Comparator.naturalOrder());
-            morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
-          }
-        }
-
-        builder.add(entry, wordForm, morphDataID);
-      }
-
-      // finalize last entry
-      success = true;
-      return new WordStorage(builder) {
-        @Override
-        char caseFold(char c) {
-          return Dictionary.this.caseFold(c);
-        }
-      };
-    } finally {
-      if (success) {
-        tempDir.deleteFile(sorted);
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
-      }
-    }
+    // TODO: the flags themselves can be double-chars (long) or also numeric
+    // either way the trick is to encode them as char... but they must be parsed differently
+
+    while (true) {
+      String line = sorted.next();
+      if (line == null) break;
+
+      String entry;
+      char[] wordForm;
+      int end;
+
+      int flagSep = line.indexOf(FLAG_SEPARATOR);
+      if (flagSep == -1) {
+        wordForm = NOFLAGS;
+        end = line.indexOf(MORPH_SEPARATOR);
+        entry = line.substring(0, end);
+      } else {
+        end = line.indexOf(MORPH_SEPARATOR);
+        boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
+        String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
+        if (aliasCount > 0 && !flagPart.isEmpty()) {
+          flagPart = getAliasValue(Integer.parseInt(flagPart));
+        }
+        wordForm = flagParsingStrategy.parseFlags(flagPart);
+        if (hidden) {
+          wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
+          wordForm[wordForm.length - 1] = HIDDEN_FLAG;
+        }
+        entry = line.substring(0, flagSep);
+      }
+      if (entry.isEmpty()) continue;
+
+      int morphDataID = 0;
+      if (end + 1 < line.length()) {
+        List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
+        if (!morphFields.isEmpty()) {
+          morphFields.sort(Comparator.naturalOrder());
+          morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
+        }
+      }
+
+      builder.add(entry, wordForm, morphDataID);
+    }
+
+    return new WordStorage(builder) {
+      @Override
+      char caseFold(char c) {
+        return Dictionary.this.caseFold(c);
+      }
+    };
   }

   /**

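For orientation, here is a usage sketch of the constructor added above (not part of the commit). The en_US file names are illustrative; Dictionary, SortingStrategy, Hunspell, and TimeoutPolicy are the public hunspell APIs this diff adds or its tests exercise:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.analysis.hunspell.SortingStrategy;
import org.apache.lucene.analysis.hunspell.TimeoutPolicy;

public class LoadDictionarySketch {
  public static void main(String[] args) throws Exception {
    // Illustrative dictionary location; any hunspell .aff/.dic pair works.
    Path aff = Path.of("en_US.aff");
    Path dic = Path.of("en_US.dic");
    try (InputStream affix = Files.newInputStream(aff);
        InputStream words = Files.newInputStream(dic)) {
      // New in this commit: sort entries in memory, for faster loading at the cost
      // of O(dictionary size) temporary heap. The previous temp-file behavior stays
      // available via SortingStrategy.offline(tempDir, tempFileNamePrefix).
      Dictionary dictionary =
          new Dictionary(affix, List.of(words), false, SortingStrategy.inMemory());
      Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
      System.out.println(speller.spell("example"));
    }
  }
}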
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java (new file)

@@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
/**
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
*
* @see #offline(Directory, String)
* @see #inMemory()
*/
public abstract class SortingStrategy {
abstract EntryAccumulator start() throws IOException;
interface EntryAccumulator {
void addEntry(String entry) throws IOException;
EntrySupplier finishAndSort() throws IOException;
}
interface EntrySupplier extends Closeable {
int wordCount();
/** The next line or {@code null} if the end is reached */
String next() throws IOException;
}
/**
* An "offline" strategy that creates temporary files in the given directory and uses them for
* sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
* load the entire dictionary into memory.
*/
public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
return new SortingStrategy() {
@Override
EntryAccumulator start() throws IOException {
IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
ByteSequencesWriter writer = new ByteSequencesWriter(output);
return new EntryAccumulator() {
int wordCount = 0;
@Override
public void addEntry(String entry) throws IOException {
wordCount++;
writer.write(entry.getBytes(StandardCharsets.UTF_8));
}
@Override
public EntrySupplier finishAndSort() throws IOException {
CodecUtil.writeFooter(output);
writer.close();
String sortedFile = sortWordsOffline();
ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
return new EntrySupplier() {
boolean success = false;
@Override
public int wordCount() {
return wordCount;
}
@Override
public String next() throws IOException {
BytesRef scratch = reader.next();
if (scratch == null) {
success = true;
return null;
}
return scratch.utf8ToString();
}
@Override
public void close() throws IOException {
reader.close();
if (success) {
tempDir.deleteFile(sortedFile);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
}
}
};
}
private String sortWordsOffline() throws IOException {
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
String sorted;
boolean success = false;
try {
sorted = sorter.sort(output.getName());
success = true;
} finally {
if (success) {
tempDir.deleteFile(output.getName());
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
}
}
return sorted;
}
};
}
};
}
/**
* The strategy that loads all entries as {@link String} objects and sorts them in memory. The
* entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
* itself needs {@code O(dictionary_size)} memory.
*/
public static SortingStrategy inMemory() {
return new SortingStrategy() {
@Override
EntryAccumulator start() {
List<String> entries = new ArrayList<>();
return new EntryAccumulator() {
@Override
public void addEntry(String entry) {
entries.add(entry);
}
@Override
public EntrySupplier finishAndSort() {
entries.sort(Comparator.naturalOrder());
return new EntrySupplier() {
int i = 0;
@Override
public int wordCount() {
return entries.size();
}
@Override
public String next() {
return i < entries.size() ? entries.get(i++) : null;
}
@Override
public void close() {}
};
}
};
}
};
}
}

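The accumulator/supplier handshake above is small enough to sketch in isolation (again, not part of the commit). Note that start(), EntryAccumulator, and EntrySupplier are package-private, so a caller like this must live in org.apache.lucene.analysis.hunspell; the two entries are made-up .dic-style lines:

package org.apache.lucene.analysis.hunspell;

import java.io.IOException;

class SortingStrategyDemo {
  public static void main(String[] args) throws IOException {
    SortingStrategy.EntryAccumulator acc = SortingStrategy.inMemory().start();
    acc.addEntry("work/G");
    acc.addEntry("apple/S");
    // finishAndSort() freezes the accumulated entries into a sorted supplier.
    try (SortingStrategy.EntrySupplier sorted = acc.finishAndSort()) {
      System.out.println(sorted.wordCount()); // 2
      for (String line = sorted.next(); line != null; line = sorted.next()) {
        System.out.println(line); // "apple/S", then "work/G"
      }
    }
  }
}

Dictionary drives exactly this sequence during loading: mergeDictionaries feeds addEntry, and readSortedDictionaries consumes the sorted supplier, so the two strategies differ only in where the entries live between those phases.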
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java

@@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.tests.store.BaseDirectoryWrapper;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.tests.util.RamUsageTester;
@@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
     Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
     assert Files.exists(dic) : dic;
     try (InputStream dictionary = Files.newInputStream(dic);
-        InputStream affix = Files.newInputStream(aff);
-        BaseDirectoryWrapper tempDir = newDirectory()) {
-      return new Dictionary(tempDir, "dictionary", affix, dictionary) {
+        InputStream affix = Files.newInputStream(aff)) {
+      return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
         @Override
         protected boolean tolerateAffixRuleCountMismatches() {
           return true;

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java

@@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
   }

   static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
-    InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
-    InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
+    checkSpellCheckerExpectations(
+        basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
+    checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
+  }
+
+  private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
+      throws IOException, ParseException {
+    Path affFile = Path.of(basePath + ".aff");
+    Path dicFile = Path.of(basePath + ".dic");
+    InputStream affixStream = Files.newInputStream(affFile);
+    InputStream dictStream = Files.newInputStream(dicFile);

     Hunspell speller;
     Map<String, Suggester> suggesters = new LinkedHashMap<>();
     try {
-      Dictionary dictionary =
-          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
+      Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
       speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
       Suggester suggester = new Suggester(dictionary);
       suggesters.put("default", suggester);