hunspell: allow in-memory entry sorting for faster dictionary loading (#12834)

* hunspell: allow in-memory entry sorting for faster dictionary loading

Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
Peter Gromov 2023-11-24 08:21:43 +01:00 committed by GitHub
parent 981339be04
commit f460d612b5
5 changed files with 297 additions and 145 deletions

lucene/CHANGES.txt

@@ -88,7 +88,8 @@ Optimizations
 * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
-* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
+* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
+  (Peter Gromov)
 * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

@@ -50,19 +50,12 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefComparator;
-import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.OfflineSorter;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -216,6 +209,25 @@ public class Dictionary {
       List<InputStream> dictionaries,
       boolean ignoreCase)
       throws IOException, ParseException {
+    this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
+  }
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to
+   * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
+   * @param sortingStrategy the strategy used to sort the dictionary entries during loading
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(
+      InputStream affix,
+      List<InputStream> dictionaries,
+      boolean ignoreCase,
+      SortingStrategy sortingStrategy)
+      throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
     try (BufferedInputStream affixStream =
@@ -251,10 +263,11 @@
     readAffixFile(affixStream, decoder, flagEnumerator);

     // read dictionary entries
-    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
-    int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
-    String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
-    words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
+    EntryAccumulator acc = sortingStrategy.start();
+    mergeDictionaries(dictionaries, decoder, acc);
+    try (EntrySupplier sorted = acc.finishAndSort()) {
+      words = readSortedDictionaries(flagEnumerator, sorted);
+    }
     flagLookup = flagEnumerator.finish();
     aliases = null; // no longer needed
     morphAliases = null; // no longer needed
@@ -986,52 +999,43 @@
     }
   }

-  private int mergeDictionaries(
-      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
+  private void mergeDictionaries(
+      List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
       throws IOException {
     StringBuilder sb = new StringBuilder();
-    int wordCount = 0;
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
-      for (InputStream dictionary : dictionaries) {
-        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
-        lines.readLine(); // first line is number of entries (approximately, sometimes)
-
-        String line;
-        while ((line = lines.readLine()) != null) {
-          // wild and unpredictable code comment rules
-          if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
-            continue;
-          }
-          line = unescapeEntry(line);
-          // if we haven't seen any custom morphological data, try to parse one
-          if (!hasCustomMorphData) {
-            int morphStart = line.indexOf(MORPH_SEPARATOR);
-            if (morphStart >= 0) {
-              String data = line.substring(morphStart + 1);
-              hasCustomMorphData =
-                  splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
-            }
-          }
-          wordCount += writeNormalizedWordEntry(sb, writer, line);
-        }
-      }
-      CodecUtil.writeFooter(output);
-    }
-    return wordCount;
+    for (InputStream dictionary : dictionaries) {
+      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+      lines.readLine(); // first line is number of entries (approximately, sometimes)
+
+      String line;
+      while ((line = lines.readLine()) != null) {
+        // wild and unpredictable code comment rules
+        if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
+          continue;
+        }
+        line = unescapeEntry(line);
+        // if we haven't seen any custom morphological data, try to parse one
+        if (!hasCustomMorphData) {
+          int morphStart = line.indexOf(MORPH_SEPARATOR);
+          if (morphStart >= 0) {
+            String data = line.substring(morphStart + 1);
+            hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
+          }
+        }
+        writeNormalizedWordEntry(sb, line, acc);
+      }
+    }
   }

-  /**
-   * @return the number of word entries written
-   */
-  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+  private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
       throws IOException {
     int flagSep = line.indexOf(FLAG_SEPARATOR);
     int morphSep = line.indexOf(MORPH_SEPARATOR);
     assert morphSep > 0;
     assert morphSep > flagSep;
     int sep = flagSep < 0 ? morphSep : flagSep;
-    if (sep == 0) return 0;
+    if (sep == 0) return;

     CharSequence toWrite;
     String beforeSep = line.substring(0, sep);
@@ -1045,19 +1049,16 @@

     String written = toWrite.toString();
     sep = written.length() - (line.length() - sep);
-    writer.write(written.getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(written);

     WordCase wordCase = WordCase.caseOf(written, sep);
     if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
-      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
-      return 2;
+      addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
     }
-    return 1;
   }

   private void addHiddenCapitalizedWord(
-      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
-      throws IOException {
+      StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
     reuse.setLength(0);
     reuse.append(Character.toUpperCase(word.charAt(0)));
     for (int i = 1; i < word.length(); i++) {
@@ -1066,7 +1067,7 @@
     reuse.append(FLAG_SEPARATOR);
     reuse.append(HIDDEN_FLAG);
     reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
-    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(reuse.toString());
   }

   String toLowerCase(String word) {
@@ -1086,102 +1087,66 @@
     return new String(chars);
   }

-  private String sortWordsOffline(
-      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
-    var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
-    String sorted;
-    boolean success = false;
-    try {
-      sorted = sorter.sort(unsorted.getName());
-      success = true;
-    } finally {
-      if (success) {
-        tempDir.deleteFile(unsorted.getName());
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
-      }
-    }
-    return sorted;
-  }
-
-  private WordStorage readSortedDictionaries(
-      Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
-    boolean success = false;
-
+  private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
+      throws IOException {
     Map<String, Integer> morphIndices = new HashMap<>();

     WordStorage.Builder builder =
         new WordStorage.Builder(
-            wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
+            sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());

-    try (ByteSequencesReader reader =
-        new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
-      // TODO: the flags themselves can be double-chars (long) or also numeric
-      // either way the trick is to encode them as char... but they must be parsed differently
-
-      while (true) {
-        BytesRef scratch = reader.next();
-        if (scratch == null) {
-          break;
-        }
-
-        String entry;
-        char[] wordForm;
-        int end;
-
-        String line = scratch.utf8ToString();
-        int flagSep = line.indexOf(FLAG_SEPARATOR);
-        if (flagSep == -1) {
-          wordForm = NOFLAGS;
-          end = line.indexOf(MORPH_SEPARATOR);
-          entry = line.substring(0, end);
-        } else {
-          end = line.indexOf(MORPH_SEPARATOR);
-          boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
-          String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
-          if (aliasCount > 0 && !flagPart.isEmpty()) {
-            flagPart = getAliasValue(Integer.parseInt(flagPart));
-          }
-          wordForm = flagParsingStrategy.parseFlags(flagPart);
-          if (hidden) {
-            wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
-            wordForm[wordForm.length - 1] = HIDDEN_FLAG;
-          }
-          entry = line.substring(0, flagSep);
-        }
-        if (entry.isEmpty()) continue;
-
-        int morphDataID = 0;
-        if (end + 1 < line.length()) {
-          List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
-          if (!morphFields.isEmpty()) {
-            morphFields.sort(Comparator.naturalOrder());
-            morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
-          }
-        }
-
-        builder.add(entry, wordForm, morphDataID);
-      }
-
-      // finalize last entry
-      success = true;
-      return new WordStorage(builder) {
-        @Override
-        char caseFold(char c) {
-          return Dictionary.this.caseFold(c);
-        }
-      };
-    } finally {
-      if (success) {
-        tempDir.deleteFile(sorted);
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
-      }
-    }
+    // TODO: the flags themselves can be double-chars (long) or also numeric
+    // either way the trick is to encode them as char... but they must be parsed differently
+
+    while (true) {
+      String line = sorted.next();
+      if (line == null) break;
+
+      String entry;
+      char[] wordForm;
+      int end;
+
+      int flagSep = line.indexOf(FLAG_SEPARATOR);
+      if (flagSep == -1) {
+        wordForm = NOFLAGS;
+        end = line.indexOf(MORPH_SEPARATOR);
+        entry = line.substring(0, end);
+      } else {
+        end = line.indexOf(MORPH_SEPARATOR);
+        boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
+        String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
+        if (aliasCount > 0 && !flagPart.isEmpty()) {
+          flagPart = getAliasValue(Integer.parseInt(flagPart));
+        }
+        wordForm = flagParsingStrategy.parseFlags(flagPart);
+        if (hidden) {
+          wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
+          wordForm[wordForm.length - 1] = HIDDEN_FLAG;
+        }
+        entry = line.substring(0, flagSep);
+      }
+      if (entry.isEmpty()) continue;
+
+      int morphDataID = 0;
+      if (end + 1 < line.length()) {
+        List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
+        if (!morphFields.isEmpty()) {
+          morphFields.sort(Comparator.naturalOrder());
+          morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
+        }
+      }
+
+      builder.add(entry, wordForm, morphDataID);
+    }
+
+    return new WordStorage(builder) {
+      @Override
+      char caseFold(char c) {
+        return Dictionary.this.caseFold(c);
+      }
+    };
   }

   /**

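For orientation, here is a usage sketch of the constructor added above (not part of the commit). The en_US file names are illustrative; Dictionary, SortingStrategy, Hunspell, and TimeoutPolicy are the public hunspell APIs this diff adds or its tests exercise:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.analysis.hunspell.SortingStrategy;
import org.apache.lucene.analysis.hunspell.TimeoutPolicy;

public class LoadDictionarySketch {
  public static void main(String[] args) throws Exception {
    // Illustrative dictionary location; any hunspell .aff/.dic pair works.
    Path aff = Path.of("en_US.aff");
    Path dic = Path.of("en_US.dic");
    try (InputStream affix = Files.newInputStream(aff);
        InputStream words = Files.newInputStream(dic)) {
      // New in this commit: sort entries in memory, for faster loading at the cost
      // of O(dictionary size) temporary heap. The previous temp-file behavior stays
      // available via SortingStrategy.offline(tempDir, tempFileNamePrefix).
      Dictionary dictionary =
          new Dictionary(affix, List.of(words), false, SortingStrategy.inMemory());
      Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
      System.out.println(speller.spell("example"));
    }
  }
}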
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java (new file)

@@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
/**
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
*
* @see #offline(Directory, String)
* @see #inMemory()
*/
public abstract class SortingStrategy {
abstract EntryAccumulator start() throws IOException;
interface EntryAccumulator {
void addEntry(String entry) throws IOException;
EntrySupplier finishAndSort() throws IOException;
}
interface EntrySupplier extends Closeable {
int wordCount();
/** The next line or {@code null} if the end is reached */
String next() throws IOException;
}
/**
* An "offline" strategy that creates temporary files in the given directory and uses them for
* sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
* load the entire dictionary into memory.
*/
public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
return new SortingStrategy() {
@Override
EntryAccumulator start() throws IOException {
IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
ByteSequencesWriter writer = new ByteSequencesWriter(output);
return new EntryAccumulator() {
int wordCount = 0;
@Override
public void addEntry(String entry) throws IOException {
wordCount++;
writer.write(entry.getBytes(StandardCharsets.UTF_8));
}
@Override
public EntrySupplier finishAndSort() throws IOException {
CodecUtil.writeFooter(output);
writer.close();
String sortedFile = sortWordsOffline();
ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
return new EntrySupplier() {
boolean success = false;
@Override
public int wordCount() {
return wordCount;
}
@Override
public String next() throws IOException {
BytesRef scratch = reader.next();
if (scratch == null) {
success = true;
return null;
}
return scratch.utf8ToString();
}
@Override
public void close() throws IOException {
reader.close();
if (success) {
tempDir.deleteFile(sortedFile);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
}
}
};
}
private String sortWordsOffline() throws IOException {
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
String sorted;
boolean success = false;
try {
sorted = sorter.sort(output.getName());
success = true;
} finally {
if (success) {
tempDir.deleteFile(output.getName());
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
}
}
return sorted;
}
};
}
};
}
/**
* The strategy that loads all entries as {@link String} objects and sorts them in memory. The
* entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
* itself needs {@code O(dictionary_size)} memory.
*/
public static SortingStrategy inMemory() {
return new SortingStrategy() {
@Override
EntryAccumulator start() {
List<String> entries = new ArrayList<>();
return new EntryAccumulator() {
@Override
public void addEntry(String entry) {
entries.add(entry);
}
@Override
public EntrySupplier finishAndSort() {
entries.sort(Comparator.naturalOrder());
return new EntrySupplier() {
int i = 0;
@Override
public int wordCount() {
return entries.size();
}
@Override
public String next() {
return i < entries.size() ? entries.get(i++) : null;
}
@Override
public void close() {}
};
}
};
}
};
}
}

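The accumulator/supplier handshake above is small enough to sketch in isolation (again, not part of the commit). Note that start(), EntryAccumulator, and EntrySupplier are package-private, so a caller like this must live in org.apache.lucene.analysis.hunspell; the two entries are made-up .dic-style lines:

package org.apache.lucene.analysis.hunspell;

import java.io.IOException;

class SortingStrategyDemo {
  public static void main(String[] args) throws IOException {
    SortingStrategy.EntryAccumulator acc = SortingStrategy.inMemory().start();
    acc.addEntry("work/G");
    acc.addEntry("apple/S");
    // finishAndSort() freezes the accumulated entries into a sorted supplier.
    try (SortingStrategy.EntrySupplier sorted = acc.finishAndSort()) {
      System.out.println(sorted.wordCount()); // 2
      for (String line = sorted.next(); line != null; line = sorted.next()) {
        System.out.println(line); // "apple/S", then "work/G"
      }
    }
  }
}

Dictionary drives exactly this sequence during loading: mergeDictionaries feeds addEntry, and readSortedDictionaries consumes the sorted supplier, so the two strategies differ only in where the entries live between those phases.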
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java

@@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.tests.store.BaseDirectoryWrapper;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.tests.util.RamUsageTester;
@@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
     Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
     assert Files.exists(dic) : dic;
     try (InputStream dictionary = Files.newInputStream(dic);
-        InputStream affix = Files.newInputStream(aff);
-        BaseDirectoryWrapper tempDir = newDirectory()) {
-      return new Dictionary(tempDir, "dictionary", affix, dictionary) {
+        InputStream affix = Files.newInputStream(aff)) {
+      return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
         @Override
         protected boolean tolerateAffixRuleCountMismatches() {
           return true;

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java

@@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
   }

   static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
-    InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
-    InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
+    checkSpellCheckerExpectations(
+        basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
+    checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
+  }
+
+  private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
+      throws IOException, ParseException {
+    Path affFile = Path.of(basePath + ".aff");
+    Path dicFile = Path.of(basePath + ".dic");
+    InputStream affixStream = Files.newInputStream(affFile);
+    InputStream dictStream = Files.newInputStream(dicFile);

     Hunspell speller;
     Map<String, Suggester> suggesters = new LinkedHashMap<>();
     try {
-      Dictionary dictionary =
-          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
+      Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
       speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
       Suggester suggester = new Suggester(dictionary);
       suggesters.put("default", suggester);