mirror of https://github.com/apache/lucene.git

hunspell: allow in-memory entry sorting for faster dictionary loading (#12834)

Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>

parent 981339be04
commit f460d612b5
lucene/CHANGES.txt:

@@ -88,7 +88,8 @@ Optimizations

 * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)

-* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
+* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
+  (Peter Gromov)

 * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
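For orientation before the diff: the commit adds a second public Dictionary constructor taking a SortingStrategy, where SortingStrategy.offline(...) preserves the old temp-file behavior and SortingStrategy.inMemory() trades heap for speed. A minimal caller-side sketch; the en_US file names and the temp-directory choice are illustrative assumptions, while the constructor and both factory methods are exactly those in the hunks below:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.SortingStrategy;
import org.apache.lucene.store.FSDirectory;

public class LoadDictionarySketch {
  public static void main(String[] args) throws Exception {
    // Faster loading; temporarily holds every entry as a String on the heap.
    try (InputStream affix = Files.newInputStream(Path.of("en_US.aff"));
        InputStream dic = Files.newInputStream(Path.of("en_US.dic"))) {
      Dictionary dictionary =
          new Dictionary(affix, List.of(dic), false, SortingStrategy.inMemory());
    }
    // Slower loading; sorts entries through temporary files instead.
    try (InputStream affix = Files.newInputStream(Path.of("en_US.aff"));
        InputStream dic = Files.newInputStream(Path.of("en_US.dic"))) {
      Dictionary dictionary =
          new Dictionary(
              affix, List.of(dic), false,
              SortingStrategy.offline(
                  FSDirectory.open(Path.of(System.getProperty("java.io.tmpdir"))), "hunspell"));
    }
  }
}

The pre-existing constructor that takes a Directory and a temp-file prefix keeps its behavior by delegating to SortingStrategy.offline, as the first Dictionary.java hunk shows.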
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java:

@@ -50,19 +50,12 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
+import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefComparator;
-import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.OfflineSorter;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -216,6 +209,25 @@ public class Dictionary {
       List<InputStream> dictionaries,
       boolean ignoreCase)
       throws IOException, ParseException {
+    this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
+  }
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to
+   * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
+   * @param sortingStrategy the strategy used to sort the dictionary entries during loading
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(
+      InputStream affix,
+      List<InputStream> dictionaries,
+      boolean ignoreCase,
+      SortingStrategy sortingStrategy)
+      throws IOException, ParseException {
     this.ignoreCase = ignoreCase;

     try (BufferedInputStream affixStream =
@@ -251,10 +263,11 @@ public class Dictionary {
       readAffixFile(affixStream, decoder, flagEnumerator);

       // read dictionary entries
-      IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
-      int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
-      String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
-      words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
+      EntryAccumulator acc = sortingStrategy.start();
+      mergeDictionaries(dictionaries, decoder, acc);
+      try (EntrySupplier sorted = acc.finishAndSort()) {
+        words = readSortedDictionaries(flagEnumerator, sorted);
+      }
       flagLookup = flagEnumerator.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
@@ -986,12 +999,10 @@ public class Dictionary {
     }
   }

-  private int mergeDictionaries(
-      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
+  private void mergeDictionaries(
+      List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
       throws IOException {
     StringBuilder sb = new StringBuilder();
-    int wordCount = 0;
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
     for (InputStream dictionary : dictionaries) {
       BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
       lines.readLine(); // first line is number of entries (approximately, sometimes)
@@ -1008,30 +1019,23 @@ public class Dictionary {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0) {
            String data = line.substring(morphStart + 1);
-            hasCustomMorphData =
-                splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
+            hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
          }
        }

-        wordCount += writeNormalizedWordEntry(sb, writer, line);
+        writeNormalizedWordEntry(sb, line, acc);
      }
    }
-    CodecUtil.writeFooter(output);
-    }
-    return wordCount;
  }

-  /**
-   * @return the number of word entries written
-   */
-  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+  private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
      throws IOException {
    int flagSep = line.indexOf(FLAG_SEPARATOR);
    int morphSep = line.indexOf(MORPH_SEPARATOR);
    assert morphSep > 0;
    assert morphSep > flagSep;
    int sep = flagSep < 0 ? morphSep : flagSep;
-    if (sep == 0) return 0;
+    if (sep == 0) return;

    CharSequence toWrite;
    String beforeSep = line.substring(0, sep);
@@ -1045,19 +1049,16 @@ public class Dictionary {

    String written = toWrite.toString();
    sep = written.length() - (line.length() - sep);
-    writer.write(written.getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(written);

    WordCase wordCase = WordCase.caseOf(written, sep);
    if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
-      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
-      return 2;
+      addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
    }
-    return 1;
  }

  private void addHiddenCapitalizedWord(
-      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
-      throws IOException {
+      StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
    reuse.setLength(0);
    reuse.append(Character.toUpperCase(word.charAt(0)));
    for (int i = 1; i < word.length(); i++) {
@@ -1066,7 +1067,7 @@ public class Dictionary {
    reuse.append(FLAG_SEPARATOR);
    reuse.append(HIDDEN_FLAG);
    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
-    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+    acc.addEntry(reuse.toString());
  }

  String toLowerCase(String word) {
@@ -1086,48 +1087,21 @@ public class Dictionary {
    return new String(chars);
  }

-  private String sortWordsOffline(
-      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
-    var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
-
-    String sorted;
-    boolean success = false;
-    try {
-      sorted = sorter.sort(unsorted.getName());
-      success = true;
-    } finally {
-      if (success) {
-        tempDir.deleteFile(unsorted.getName());
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
-      }
-    }
-    return sorted;
-  }
-
-  private WordStorage readSortedDictionaries(
-      Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
-    boolean success = false;
-
+  private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
+      throws IOException {
    Map<String, Integer> morphIndices = new HashMap<>();

    WordStorage.Builder builder =
        new WordStorage.Builder(
-            wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
-
-    try (ByteSequencesReader reader =
-        new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
+            sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    while (true) {
-      BytesRef scratch = reader.next();
-      if (scratch == null) {
-        break;
-      }
+      String line = sorted.next();
+      if (line == null) break;

-      String line = scratch.utf8ToString();
      String entry;
      char[] wordForm;
      int end;
@@ -1167,21 +1141,12 @@ public class Dictionary {
      builder.add(entry, wordForm, morphDataID);
    }

-    // finalize last entry
-    success = true;
    return new WordStorage(builder) {
      @Override
      char caseFold(char c) {
        return Dictionary.this.caseFold(c);
      }
    };
-    } finally {
-      if (success) {
-        tempDir.deleteFile(sorted);
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
-      }
-    }
  }

  /**
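The refactoring above reduces dictionary loading to a small contract: accumulate entries, sort once, stream them back, close. A sketch of that lifecycle with two made-up entries; since EntryAccumulator and EntrySupplier are package-private, this only compiles inside org.apache.lucene.analysis.hunspell:

package org.apache.lucene.analysis.hunspell;

import java.io.IOException;

class SortingStrategyLifecycleSketch {
  // Mirrors what the Dictionary constructor now does with its entry stream.
  static void run(SortingStrategy strategy) throws IOException {
    SortingStrategy.EntryAccumulator acc = strategy.start();
    acc.addEntry("zebra/S"); // entries arrive in .dic file order
    acc.addEntry("apple/S");
    try (SortingStrategy.EntrySupplier sorted = acc.finishAndSort()) {
      System.out.println(sorted.wordCount()); // 2
      for (String line = sorted.next(); line != null; line = sorted.next()) {
        System.out.println(line); // "apple/S", then "zebra/S"
      }
    } // close() is what lets the offline strategy delete its temp file
  }
}

run(SortingStrategy.inMemory()) and run(SortingStrategy.offline(tempDir, "prefix")) print the same output; only the sorting machinery in between differs, which is exactly what the new SortingStrategy file below encapsulates.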
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java (new file):

@@ -0,0 +1,181 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;

/**
 * The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
 * entries should be sorted in a special way, and this can be done either in-memory (faster, but
 * temporarily allocating more memory) or using disk (slower, but not needing much memory).
 *
 * @see #offline(Directory, String)
 * @see #inMemory()
 */
public abstract class SortingStrategy {

  abstract EntryAccumulator start() throws IOException;

  interface EntryAccumulator {

    void addEntry(String entry) throws IOException;

    EntrySupplier finishAndSort() throws IOException;
  }

  interface EntrySupplier extends Closeable {
    int wordCount();

    /** The next line or {@code null} if the end is reached */
    String next() throws IOException;
  }

  /**
   * An "offline" strategy that creates temporary files in the given directory and uses them for
   * sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
   * load the entire dictionary into memory.
   */
  public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() throws IOException {
        IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
        ByteSequencesWriter writer = new ByteSequencesWriter(output);
        return new EntryAccumulator() {
          int wordCount = 0;

          @Override
          public void addEntry(String entry) throws IOException {
            wordCount++;
            writer.write(entry.getBytes(StandardCharsets.UTF_8));
          }

          @Override
          public EntrySupplier finishAndSort() throws IOException {
            CodecUtil.writeFooter(output);
            writer.close();
            String sortedFile = sortWordsOffline();
            ByteSequencesReader reader =
                new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
            return new EntrySupplier() {
              boolean success = false;

              @Override
              public int wordCount() {
                return wordCount;
              }

              @Override
              public String next() throws IOException {
                BytesRef scratch = reader.next();
                if (scratch == null) {
                  success = true;
                  return null;
                }
                return scratch.utf8ToString();
              }

              @Override
              public void close() throws IOException {
                reader.close();
                if (success) {
                  tempDir.deleteFile(sortedFile);
                } else {
                  IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
                }
              }
            };
          }

          private String sortWordsOffline() throws IOException {
            var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);

            String sorted;
            boolean success = false;
            try {
              sorted = sorter.sort(output.getName());
              success = true;
            } finally {
              if (success) {
                tempDir.deleteFile(output.getName());
              } else {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
            return sorted;
          }
        };
      }
    };
  }

  /**
   * The strategy that loads all entries as {@link String} objects and sorts them in memory. The
   * entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
   * itself needs {@code O(dictionary_size)} memory.
   */
  public static SortingStrategy inMemory() {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() {
        List<String> entries = new ArrayList<>();
        return new EntryAccumulator() {
          @Override
          public void addEntry(String entry) {
            entries.add(entry);
          }

          @Override
          public EntrySupplier finishAndSort() {
            entries.sort(Comparator.naturalOrder());
            return new EntrySupplier() {
              int i = 0;

              @Override
              public int wordCount() {
                return entries.size();
              }

              @Override
              public String next() {
                return i < entries.size() ? entries.get(i++) : null;
              }

              @Override
              public void close() {}
            };
          }
        };
      }
    };
  }
}
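Which strategy to pick is left to the caller. A hypothetical helper sketch; the class, the heap probe, and the 4x per-entry String-overhead factor are illustrative assumptions, not Lucene guidance:

import org.apache.lucene.analysis.hunspell.SortingStrategy;
import org.apache.lucene.store.Directory;

class StrategyChooser {
  // Hypothetical: choose based on a rough heap budget for the dictionary file.
  // The 4x expansion factor for per-entry String overhead is an illustrative guess.
  static SortingStrategy choose(Directory tempDir, long dictionaryFileBytes) {
    Runtime rt = Runtime.getRuntime();
    long headroom = rt.maxMemory() - (rt.totalMemory() - rt.freeMemory());
    return dictionaryFileBytes * 4 < headroom
        ? SortingStrategy.inMemory() // faster, briefly O(dictionary_size) heap
        : SortingStrategy.offline(tempDir, "hunspell"); // slower, uses temp files
  }
}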
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java:

@@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.apache.lucene.tests.store.BaseDirectoryWrapper;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.tests.util.RamUsageTester;
@@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
    Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
    assert Files.exists(dic) : dic;
    try (InputStream dictionary = Files.newInputStream(dic);
-        InputStream affix = Files.newInputStream(aff);
-        BaseDirectoryWrapper tempDir = newDirectory()) {
-      return new Dictionary(tempDir, "dictionary", affix, dictionary) {
+        InputStream affix = Files.newInputStream(aff)) {
+      return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
        @Override
        protected boolean tolerateAffixRuleCountMismatches() {
          return true;
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java:

@@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
  }

  static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
-    InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
+    checkSpellCheckerExpectations(
+        basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
+    checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
+  }
+
+  private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
+      throws IOException, ParseException {
+    Path affFile = Path.of(basePath + ".aff");
+    Path dicFile = Path.of(basePath + ".dic");
+    InputStream affixStream = Files.newInputStream(affFile);
+    InputStream dictStream = Files.newInputStream(dicFile);

    Hunspell speller;
    Map<String, Suggester> suggesters = new LinkedHashMap<>();
    try {
-      Dictionary dictionary =
-          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
+      Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
      speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
      Suggester suggester = new Suggester(dictionary);
      suggesters.put("default", suggester);