hunspell: allow in-memory entry sorting for faster dictionary loading (#12834)

* hunspell: allow in-memory entry sorting for faster dictionary loading Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
2023-11-24 08:21:43 +01:00 · 2023-11-24 08:21:43 +01:00 · f460d612b5
parent 981339be04
commit f460d612b5
5 changed files with 297 additions and 145 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -88,7 +88,8 @@ Optimizations
 * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
-* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
+* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
  (Peter Gromov)
 * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -50,19 +50,12 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
 import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefComparator;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -216,6 +209,25 @@ public class Dictionary {
      List<InputStream> dictionaries,
      boolean ignoreCase)
      throws IOException, ParseException {
    this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
  }
  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to
   * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @param sortingStrategy the strategy used to sort the dictionary entries while loading
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(
      InputStream affix,
      List<InputStream> dictionaries,
      boolean ignoreCase,
      SortingStrategy sortingStrategy)
      throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    try (BufferedInputStream affixStream =
@ -251,10 +263,11 @@ public class Dictionary {
      readAffixFile(affixStream, decoder, flagEnumerator);
      // read dictionary entries
-      IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
+      EntryAccumulator acc = sortingStrategy.start();
-      int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
+      mergeDictionaries(dictionaries, decoder, acc);
-      String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
+      try (EntrySupplier sorted = acc.finishAndSort()) {
-      words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
+        words = readSortedDictionaries(flagEnumerator, sorted);
      }
      flagLookup = flagEnumerator.finish();
      aliases = null; // no longer needed
      morphAliases = null; // no longer needed
@ -986,52 +999,43 @@ public class Dictionary {
    }
  }
-  private int mergeDictionaries(
+  private void mergeDictionaries(
-      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
+      List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
      throws IOException {
    StringBuilder sb = new StringBuilder();
-    int wordCount = 0;
+    for (InputStream dictionary : dictionaries) {
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
+      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
-      for (InputStream dictionary : dictionaries) {
+      lines.readLine(); // first line is number of entries (approximately, sometimes)
        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
        lines.readLine(); // first line is number of entries (approximately, sometimes)
-        String line;
+      String line;
-        while ((line = lines.readLine()) != null) {
+      while ((line = lines.readLine()) != null) {
-          // wild and unpredictable code comment rules
+        // wild and unpredictable code comment rules
-          if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
+        if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
-            continue;
+          continue;
          }
          line = unescapeEntry(line);
          // if we haven't seen any custom morphological data, try to parse one
          if (!hasCustomMorphData) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
            if (morphStart >= 0) {
              String data = line.substring(morphStart + 1);
              hasCustomMorphData =
                  splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
            }
          }
          wordCount += writeNormalizedWordEntry(sb, writer, line);
        }
        line = unescapeEntry(line);
        // if we haven't seen any custom morphological data, try to parse one
        if (!hasCustomMorphData) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0) {
            String data = line.substring(morphStart + 1);
            hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
          }
        }
        writeNormalizedWordEntry(sb, line, acc);
      }
      CodecUtil.writeFooter(output);
    }
    return wordCount;
  }
-  /**
+  private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
   * @return the number of word entries written
   */
  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
      throws IOException {
    int flagSep = line.indexOf(FLAG_SEPARATOR);
    int morphSep = line.indexOf(MORPH_SEPARATOR);
    assert morphSep > 0;
    assert morphSep > flagSep;
    int sep = flagSep < 0 ? morphSep : flagSep;
-    if (sep == 0) return 0;
+    if (sep == 0) return;
    CharSequence toWrite;
    String beforeSep = line.substring(0, sep);
@ -1045,19 +1049,16 @@ public class Dictionary {
    String written = toWrite.toString();
    sep = written.length() - (line.length() - sep);
-    writer.write(written.getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(written);
    WordCase wordCase = WordCase.caseOf(written, sep);
    if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
-      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+      addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
      return 2;
    }
    return 1;
  }
  private void addHiddenCapitalizedWord(
-      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
+      StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
      throws IOException {
    reuse.setLength(0);
    reuse.append(Character.toUpperCase(word.charAt(0)));
    for (int i = 1; i < word.length(); i++) {
@ -1066,7 +1067,7 @@ public class Dictionary {
    reuse.append(FLAG_SEPARATOR);
    reuse.append(HIDDEN_FLAG);
    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
-    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(reuse.toString());
  }
  String toLowerCase(String word) {
@ -1086,102 +1087,66 @@ public class Dictionary {
    return new String(chars);
  }
-  private String sortWordsOffline(
+  private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
-      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
+      throws IOException {
    var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
    String sorted;
    boolean success = false;
    try {
      sorted = sorter.sort(unsorted.getName());
      success = true;
    } finally {
      if (success) {
        tempDir.deleteFile(unsorted.getName());
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
      }
    }
    return sorted;
  }
  private WordStorage readSortedDictionaries(
      Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
    boolean success = false;
    Map<String, Integer> morphIndices = new HashMap<>();
    WordStorage.Builder builder =
        new WordStorage.Builder(
-            wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
+            sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
-    try (ByteSequencesReader reader =
+    // TODO: the flags themselves can be double-chars (long) or also numeric
-        new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
+    // either way the trick is to encode them as char... but they must be parsed differently
-      // TODO: the flags themselves can be double-chars (long) or also numeric
+    while (true) {
-      // either way the trick is to encode them as char... but they must be parsed differently
+      String line = sorted.next();
      if (line == null) break;
-      while (true) {
+      String entry;
-        BytesRef scratch = reader.next();
+      char[] wordForm;
-        if (scratch == null) {
+      int end;
          break;
        }
-        String line = scratch.utf8ToString();
+      int flagSep = line.indexOf(FLAG_SEPARATOR);
-        String entry;
+      if (flagSep == -1) {
-        char[] wordForm;
+        wordForm = NOFLAGS;
-        int end;
+        end = line.indexOf(MORPH_SEPARATOR);
-
+        entry = line.substring(0, end);
        int flagSep = line.indexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
          wordForm = NOFLAGS;
          end = line.indexOf(MORPH_SEPARATOR);
          entry = line.substring(0, end);
        } else {
          end = line.indexOf(MORPH_SEPARATOR);
          boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
          String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
          if (aliasCount > 0 && !flagPart.isEmpty()) {
            flagPart = getAliasValue(Integer.parseInt(flagPart));
          }
          wordForm = flagParsingStrategy.parseFlags(flagPart);
          if (hidden) {
            wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
            wordForm[wordForm.length - 1] = HIDDEN_FLAG;
          }
          entry = line.substring(0, flagSep);
        }
        if (entry.isEmpty()) continue;
        int morphDataID = 0;
        if (end + 1 < line.length()) {
          List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
          if (!morphFields.isEmpty()) {
            morphFields.sort(Comparator.naturalOrder());
            morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
          }
        }
        builder.add(entry, wordForm, morphDataID);
      }
      // finalize last entry
      success = true;
      return new WordStorage(builder) {
        @Override
        char caseFold(char c) {
          return Dictionary.this.caseFold(c);
        }
      };
    } finally {
      if (success) {
        tempDir.deleteFile(sorted);
      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
+        end = line.indexOf(MORPH_SEPARATOR);
        boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
        String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
        if (aliasCount > 0 && !flagPart.isEmpty()) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        if (hidden) {
          wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
          wordForm[wordForm.length - 1] = HIDDEN_FLAG;
        }
        entry = line.substring(0, flagSep);
      }
      if (entry.isEmpty()) continue;
      int morphDataID = 0;
      if (end + 1 < line.length()) {
        List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
        if (!morphFields.isEmpty()) {
          morphFields.sort(Comparator.naturalOrder());
          morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
        }
      }
      builder.add(entry, wordForm, morphDataID);
    }
    return new WordStorage(builder) {
      @Override
      char caseFold(char c) {
        return Dictionary.this.caseFold(c);
      }
    };
  }
  /**
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java
@ -0,0 +1,181 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.io.Closeable;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefComparator;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 /**
 * The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
 * entries should be sorted in a special way, and this can be done either in-memory (faster, but
 * temporarily allocating more memory) or using disk (slower, but not needing much memory).
 *
 * @see #offline(Directory, String)
 * @see #inMemory()
 */
 public abstract class SortingStrategy {
  /** Begins a new loading session and returns the accumulator that collects its entries. */
  abstract EntryAccumulator start() throws IOException;
  /** Collects unsorted dictionary entries and hands them over for sorted iteration. */
  interface EntryAccumulator {
    /** Adds a single (already normalized) dictionary line. */
    void addEntry(String entry) throws IOException;
    /** Sorts everything added so far and returns a supplier iterating the sorted entries. */
    EntrySupplier finishAndSort() throws IOException;
  }
  /** Iterates over sorted entries; must be closed to release any underlying resources. */
  interface EntrySupplier extends Closeable {
    /** The number of entries that were added to the accumulator. */
    int wordCount();
    /** The next line or {@code null} if the end is reached */
    String next() throws IOException;
  }
  /**
   * An "offline" strategy that creates temporary files in the given directory and uses them for
   * sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
   * load the entire dictionary into memory.
   */
  public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() throws IOException {
        IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
        ByteSequencesWriter writer = new ByteSequencesWriter(output);
        return new EntryAccumulator() {
          int wordCount = 0;
          @Override
          public void addEntry(String entry) throws IOException {
            wordCount++;
            writer.write(entry.getBytes(StandardCharsets.UTF_8));
          }
          @Override
          public EntrySupplier finishAndSort() throws IOException {
            finishUnsortedFile();
            String sortedFile = sortWordsOffline();
            ByteSequencesReader reader = openSortedFile(sortedFile);
            return new EntrySupplier() {
              // set once the reader has been fully consumed, i.e. the whole file was read
              boolean success = false;
              @Override
              public int wordCount() {
                return wordCount;
              }
              @Override
              public String next() throws IOException {
                BytesRef scratch = reader.next();
                if (scratch == null) {
                  success = true;
                  return null;
                }
                return scratch.utf8ToString();
              }
              @Override
              public void close() throws IOException {
                boolean closed = false;
                try {
                  reader.close();
                  closed = true;
                } finally {
                  // delete the sorted file even if closing the reader failed; only report
                  // deletion problems when everything else went fine
                  if (success && closed) {
                    tempDir.deleteFile(sortedFile);
                  } else {
                    IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
                  }
                }
              }
            };
          }
          /**
           * Writes the codec footer and closes the unsorted output. The writer is closed even if
           * writing the footer fails, and the unsorted temp file is removed on any failure.
           */
          private void finishUnsortedFile() throws IOException {
            boolean finished = false;
            try {
              try (writer) {
                CodecUtil.writeFooter(output);
              }
              finished = true;
            } finally {
              if (!finished) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
          }
          /** Opens the sorted file for reading, removing it if it cannot be opened. */
          private ByteSequencesReader openSortedFile(String sortedFile) throws IOException {
            boolean opened = false;
            try {
              ByteSequencesReader reader =
                  new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
              opened = true;
              return reader;
            } finally {
              if (!opened) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
              }
            }
          }
          /** Sorts the unsorted file, deletes it afterwards and returns the sorted file's name. */
          private String sortWordsOffline() throws IOException {
            var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
            String sorted;
            boolean success = false;
            try {
              sorted = sorter.sort(output.getName());
              success = true;
            } finally {
              if (success) {
                tempDir.deleteFile(output.getName());
              } else {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
            return sorted;
          }
        };
      }
    };
  }
  /**
   * The strategy that loads all entries as {@link String} objects and sorts them in memory. The
   * entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
   * itself needs {@code O(dictionary_size)} memory.
   */
  public static SortingStrategy inMemory() {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() {
        List<String> entries = new ArrayList<>();
        return new EntryAccumulator() {
          @Override
          public void addEntry(String entry) {
            entries.add(entry);
          }
          @Override
          public EntrySupplier finishAndSort() {
            entries.sort(Comparator.naturalOrder());
            return new EntrySupplier() {
              // index of the next entry to hand out
              int i = 0;
              @Override
              public int wordCount() {
                return entries.size();
              }
              @Override
              public String next() {
                return i < entries.size() ? entries.get(i++) : null;
              }
              @Override
              public void close() {}
            };
          }
        };
      }
    };
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.tests.store.BaseDirectoryWrapper;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.tests.util.RamUsageTester;
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
    Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
    assert Files.exists(dic) : dic;
    try (InputStream dictionary = Files.newInputStream(dic);
-        InputStream affix = Files.newInputStream(aff);
+        InputStream affix = Files.newInputStream(aff)) {
-        BaseDirectoryWrapper tempDir = newDirectory()) {
+      return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
      return new Dictionary(tempDir, "dictionary", affix, dictionary) {
        @Override
        protected boolean tolerateAffixRuleCountMismatches() {
          return true;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
  }
  static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
-    InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
+    checkSpellCheckerExpectations(
        basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
    checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
  }
  private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
      throws IOException, ParseException {
    Path affFile = Path.of(basePath + ".aff");
    Path dicFile = Path.of(basePath + ".dic");
    InputStream affixStream = Files.newInputStream(affFile);
    InputStream dictStream = Files.newInputStream(dicFile);
    Hunspell speller;
    Map<String, Suggester> suggesters = new LinkedHashMap<>();
    try {
-      Dictionary dictionary =
+      Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
      speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
      Suggester suggester = new Suggester(dictionary);
      suggesters.put("default", suggester);