mirror of https://github.com/apache/lucene.git
hunspell: allow in-memory entry sorting for faster dictionary loading (#12834)
* hunspell: allow in-memory entry sorting for faster dictionary loading Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
This commit is contained in:
parent
981339be04
commit
f460d612b5
|
@ -88,7 +88,8 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
||||||
|
|
||||||
* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
|
* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
|
||||||
|
(Peter Gromov)
|
||||||
|
|
||||||
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
||||||
|
|
||||||
|
|
|
@ -50,19 +50,12 @@ import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
|
||||||
|
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
|
||||||
import org.apache.lucene.store.IndexOutput;
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
import org.apache.lucene.util.BytesRefComparator;
|
|
||||||
import org.apache.lucene.util.IOUtils;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.OfflineSorter;
|
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
|
@ -216,6 +209,25 @@ public class Dictionary {
|
||||||
List<InputStream> dictionaries,
|
List<InputStream> dictionaries,
|
||||||
boolean ignoreCase)
|
boolean ignoreCase)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
|
this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new Dictionary containing the information read from the provided InputStreams to
|
||||||
|
* hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
|
||||||
|
*
|
||||||
|
* @param affix InputStream for reading the hunspell affix file (won't be closed).
|
||||||
|
* @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
|
||||||
|
* @param sortingStrategy the strategy used to sort the entries while the dictionary is loaded (see {@link SortingStrategy})
|
||||||
|
* @throws IOException Can be thrown while reading from the InputStreams
|
||||||
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
|
*/
|
||||||
|
public Dictionary(
|
||||||
|
InputStream affix,
|
||||||
|
List<InputStream> dictionaries,
|
||||||
|
boolean ignoreCase,
|
||||||
|
SortingStrategy sortingStrategy)
|
||||||
|
throws IOException, ParseException {
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
|
|
||||||
try (BufferedInputStream affixStream =
|
try (BufferedInputStream affixStream =
|
||||||
|
@ -251,10 +263,11 @@ public class Dictionary {
|
||||||
readAffixFile(affixStream, decoder, flagEnumerator);
|
readAffixFile(affixStream, decoder, flagEnumerator);
|
||||||
|
|
||||||
// read dictionary entries
|
// read dictionary entries
|
||||||
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
EntryAccumulator acc = sortingStrategy.start();
|
||||||
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
|
mergeDictionaries(dictionaries, decoder, acc);
|
||||||
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
|
try (EntrySupplier sorted = acc.finishAndSort()) {
|
||||||
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
|
words = readSortedDictionaries(flagEnumerator, sorted);
|
||||||
|
}
|
||||||
flagLookup = flagEnumerator.finish();
|
flagLookup = flagEnumerator.finish();
|
||||||
aliases = null; // no longer needed
|
aliases = null; // no longer needed
|
||||||
morphAliases = null; // no longer needed
|
morphAliases = null; // no longer needed
|
||||||
|
@ -986,52 +999,43 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int mergeDictionaries(
|
private void mergeDictionaries(
|
||||||
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
|
List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
int wordCount = 0;
|
for (InputStream dictionary : dictionaries) {
|
||||||
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
|
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
||||||
for (InputStream dictionary : dictionaries) {
|
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
||||||
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
|
||||||
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
|
||||||
|
|
||||||
String line;
|
String line;
|
||||||
while ((line = lines.readLine()) != null) {
|
while ((line = lines.readLine()) != null) {
|
||||||
// wild and unpredictable code comment rules
|
// wild and unpredictable code comment rules
|
||||||
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
line = unescapeEntry(line);
|
|
||||||
// if we haven't seen any custom morphological data, try to parse one
|
|
||||||
if (!hasCustomMorphData) {
|
|
||||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
if (morphStart >= 0) {
|
|
||||||
String data = line.substring(morphStart + 1);
|
|
||||||
hasCustomMorphData =
|
|
||||||
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wordCount += writeNormalizedWordEntry(sb, writer, line);
|
|
||||||
}
|
}
|
||||||
|
line = unescapeEntry(line);
|
||||||
|
// if we haven't seen any custom morphological data, try to parse one
|
||||||
|
if (!hasCustomMorphData) {
|
||||||
|
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
if (morphStart >= 0) {
|
||||||
|
String data = line.substring(morphStart + 1);
|
||||||
|
hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writeNormalizedWordEntry(sb, line, acc);
|
||||||
}
|
}
|
||||||
CodecUtil.writeFooter(output);
|
|
||||||
}
|
}
|
||||||
return wordCount;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
|
||||||
* @return the number of word entries written
|
|
||||||
*/
|
|
||||||
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||||
int morphSep = line.indexOf(MORPH_SEPARATOR);
|
int morphSep = line.indexOf(MORPH_SEPARATOR);
|
||||||
assert morphSep > 0;
|
assert morphSep > 0;
|
||||||
assert morphSep > flagSep;
|
assert morphSep > flagSep;
|
||||||
int sep = flagSep < 0 ? morphSep : flagSep;
|
int sep = flagSep < 0 ? morphSep : flagSep;
|
||||||
if (sep == 0) return 0;
|
if (sep == 0) return;
|
||||||
|
|
||||||
CharSequence toWrite;
|
CharSequence toWrite;
|
||||||
String beforeSep = line.substring(0, sep);
|
String beforeSep = line.substring(0, sep);
|
||||||
|
@ -1045,19 +1049,16 @@ public class Dictionary {
|
||||||
|
|
||||||
String written = toWrite.toString();
|
String written = toWrite.toString();
|
||||||
sep = written.length() - (line.length() - sep);
|
sep = written.length() - (line.length() - sep);
|
||||||
writer.write(written.getBytes(StandardCharsets.UTF_8));
|
acc.addEntry(written);
|
||||||
|
|
||||||
WordCase wordCase = WordCase.caseOf(written, sep);
|
WordCase wordCase = WordCase.caseOf(written, sep);
|
||||||
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
|
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
|
||||||
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
|
addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
|
||||||
return 2;
|
|
||||||
}
|
}
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addHiddenCapitalizedWord(
|
private void addHiddenCapitalizedWord(
|
||||||
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
|
StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
|
||||||
throws IOException {
|
|
||||||
reuse.setLength(0);
|
reuse.setLength(0);
|
||||||
reuse.append(Character.toUpperCase(word.charAt(0)));
|
reuse.append(Character.toUpperCase(word.charAt(0)));
|
||||||
for (int i = 1; i < word.length(); i++) {
|
for (int i = 1; i < word.length(); i++) {
|
||||||
|
@ -1066,7 +1067,7 @@ public class Dictionary {
|
||||||
reuse.append(FLAG_SEPARATOR);
|
reuse.append(FLAG_SEPARATOR);
|
||||||
reuse.append(HIDDEN_FLAG);
|
reuse.append(HIDDEN_FLAG);
|
||||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
acc.addEntry(reuse.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
String toLowerCase(String word) {
|
String toLowerCase(String word) {
|
||||||
|
@ -1086,102 +1087,66 @@ public class Dictionary {
|
||||||
return new String(chars);
|
return new String(chars);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String sortWordsOffline(
|
private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
|
||||||
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
|
throws IOException {
|
||||||
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
|
|
||||||
|
|
||||||
String sorted;
|
|
||||||
boolean success = false;
|
|
||||||
try {
|
|
||||||
sorted = sorter.sort(unsorted.getName());
|
|
||||||
success = true;
|
|
||||||
} finally {
|
|
||||||
if (success) {
|
|
||||||
tempDir.deleteFile(unsorted.getName());
|
|
||||||
} else {
|
|
||||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sorted;
|
|
||||||
}
|
|
||||||
|
|
||||||
private WordStorage readSortedDictionaries(
|
|
||||||
Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
|
|
||||||
boolean success = false;
|
|
||||||
|
|
||||||
Map<String, Integer> morphIndices = new HashMap<>();
|
Map<String, Integer> morphIndices = new HashMap<>();
|
||||||
|
|
||||||
WordStorage.Builder builder =
|
WordStorage.Builder builder =
|
||||||
new WordStorage.Builder(
|
new WordStorage.Builder(
|
||||||
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||||
|
|
||||||
try (ByteSequencesReader reader =
|
// TODO: the flags themselves can be double-chars (long) or also numeric
|
||||||
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
|
// either way the trick is to encode them as char... but they must be parsed differently
|
||||||
|
|
||||||
// TODO: the flags themselves can be double-chars (long) or also numeric
|
while (true) {
|
||||||
// either way the trick is to encode them as char... but they must be parsed differently
|
String line = sorted.next();
|
||||||
|
if (line == null) break;
|
||||||
|
|
||||||
while (true) {
|
String entry;
|
||||||
BytesRef scratch = reader.next();
|
char[] wordForm;
|
||||||
if (scratch == null) {
|
int end;
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
String line = scratch.utf8ToString();
|
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||||
String entry;
|
if (flagSep == -1) {
|
||||||
char[] wordForm;
|
wordForm = NOFLAGS;
|
||||||
int end;
|
end = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
entry = line.substring(0, end);
|
||||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
|
||||||
if (flagSep == -1) {
|
|
||||||
wordForm = NOFLAGS;
|
|
||||||
end = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
entry = line.substring(0, end);
|
|
||||||
} else {
|
|
||||||
end = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
|
||||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
|
||||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
|
||||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
|
||||||
}
|
|
||||||
|
|
||||||
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
|
||||||
if (hidden) {
|
|
||||||
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
|
||||||
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
|
||||||
}
|
|
||||||
entry = line.substring(0, flagSep);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (entry.isEmpty()) continue;
|
|
||||||
|
|
||||||
int morphDataID = 0;
|
|
||||||
if (end + 1 < line.length()) {
|
|
||||||
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
|
||||||
if (!morphFields.isEmpty()) {
|
|
||||||
morphFields.sort(Comparator.naturalOrder());
|
|
||||||
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.add(entry, wordForm, morphDataID);
|
|
||||||
}
|
|
||||||
|
|
||||||
// finalize last entry
|
|
||||||
success = true;
|
|
||||||
return new WordStorage(builder) {
|
|
||||||
@Override
|
|
||||||
char caseFold(char c) {
|
|
||||||
return Dictionary.this.caseFold(c);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
} finally {
|
|
||||||
if (success) {
|
|
||||||
tempDir.deleteFile(sorted);
|
|
||||||
} else {
|
} else {
|
||||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
|
end = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||||
|
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
||||||
|
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||||
|
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||||
|
}
|
||||||
|
|
||||||
|
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
||||||
|
if (hidden) {
|
||||||
|
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
||||||
|
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
||||||
|
}
|
||||||
|
entry = line.substring(0, flagSep);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (entry.isEmpty()) continue;
|
||||||
|
|
||||||
|
int morphDataID = 0;
|
||||||
|
if (end + 1 < line.length()) {
|
||||||
|
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||||
|
if (!morphFields.isEmpty()) {
|
||||||
|
morphFields.sort(Comparator.naturalOrder());
|
||||||
|
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.add(entry, wordForm, morphDataID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return new WordStorage(builder) {
|
||||||
|
@Override
|
||||||
|
char caseFold(char c) {
|
||||||
|
return Dictionary.this.caseFold(c);
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.io.Closeable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefComparator;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.OfflineSorter;
|
||||||
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||||
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
|
||||||
|
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
|
||||||
|
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
|
||||||
|
*
|
||||||
|
* @see #offline(Directory, String)
|
||||||
|
* @see #inMemory()
|
||||||
|
*/
|
||||||
|
public abstract class SortingStrategy {
|
||||||
|
|
||||||
|
abstract EntryAccumulator start() throws IOException;
|
||||||
|
|
||||||
|
interface EntryAccumulator {
|
||||||
|
|
||||||
|
void addEntry(String entry) throws IOException;
|
||||||
|
|
||||||
|
EntrySupplier finishAndSort() throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface EntrySupplier extends Closeable {
|
||||||
|
int wordCount();
|
||||||
|
|
||||||
|
/** The next line or {@code null} if the end is reached */
|
||||||
|
String next() throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An "offline" strategy that creates temporary files in the given directory and uses them for
|
||||||
|
* sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
|
||||||
|
* load the entire dictionary into memory.
|
||||||
|
*/
|
||||||
|
public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
|
||||||
|
return new SortingStrategy() {
|
||||||
|
@Override
|
||||||
|
EntryAccumulator start() throws IOException {
|
||||||
|
IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
||||||
|
ByteSequencesWriter writer = new ByteSequencesWriter(output);
|
||||||
|
return new EntryAccumulator() {
|
||||||
|
int wordCount = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addEntry(String entry) throws IOException {
|
||||||
|
wordCount++;
|
||||||
|
writer.write(entry.getBytes(StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EntrySupplier finishAndSort() throws IOException {
|
||||||
|
CodecUtil.writeFooter(output);
|
||||||
|
writer.close();
|
||||||
|
String sortedFile = sortWordsOffline();
|
||||||
|
ByteSequencesReader reader =
|
||||||
|
new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
|
||||||
|
return new EntrySupplier() {
|
||||||
|
boolean success = false;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int wordCount() {
|
||||||
|
return wordCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String next() throws IOException {
|
||||||
|
BytesRef scratch = reader.next();
|
||||||
|
if (scratch == null) {
|
||||||
|
success = true;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return scratch.utf8ToString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
reader.close();
|
||||||
|
if (success) {
|
||||||
|
tempDir.deleteFile(sortedFile);
|
||||||
|
} else {
|
||||||
|
IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private String sortWordsOffline() throws IOException {
|
||||||
|
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
|
||||||
|
|
||||||
|
String sorted;
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
sorted = sorter.sort(output.getName());
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (success) {
|
||||||
|
tempDir.deleteFile(output.getName());
|
||||||
|
} else {
|
||||||
|
IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sorted;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The strategy that loads all entries as {@link String} objects and sorts them in memory. The
|
||||||
|
* entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
|
||||||
|
* itself needs {@code O(dictionary_size)} memory.
|
||||||
|
*/
|
||||||
|
public static SortingStrategy inMemory() {
|
||||||
|
return new SortingStrategy() {
|
||||||
|
@Override
|
||||||
|
EntryAccumulator start() {
|
||||||
|
List<String> entries = new ArrayList<>();
|
||||||
|
return new EntryAccumulator() {
|
||||||
|
@Override
|
||||||
|
public void addEntry(String entry) {
|
||||||
|
entries.add(entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EntrySupplier finishAndSort() {
|
||||||
|
entries.sort(Comparator.naturalOrder());
|
||||||
|
return new EntrySupplier() {
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int wordCount() {
|
||||||
|
return entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String next() {
|
||||||
|
return i < entries.size() ? entries.get(i++) : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
|
@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.tests.store.BaseDirectoryWrapper;
|
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
||||||
import org.apache.lucene.tests.util.RamUsageTester;
|
import org.apache.lucene.tests.util.RamUsageTester;
|
||||||
|
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
|
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
|
||||||
assert Files.exists(dic) : dic;
|
assert Files.exists(dic) : dic;
|
||||||
try (InputStream dictionary = Files.newInputStream(dic);
|
try (InputStream dictionary = Files.newInputStream(dic);
|
||||||
InputStream affix = Files.newInputStream(aff);
|
InputStream affix = Files.newInputStream(aff)) {
|
||||||
BaseDirectoryWrapper tempDir = newDirectory()) {
|
return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
|
||||||
return new Dictionary(tempDir, "dictionary", affix, dictionary) {
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean tolerateAffixRuleCountMismatches() {
|
protected boolean tolerateAffixRuleCountMismatches() {
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
||||||
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
|
checkSpellCheckerExpectations(
|
||||||
|
basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
|
||||||
|
checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
|
||||||
|
throws IOException, ParseException {
|
||||||
|
Path affFile = Path.of(basePath + ".aff");
|
||||||
Path dicFile = Path.of(basePath + ".dic");
|
Path dicFile = Path.of(basePath + ".dic");
|
||||||
|
InputStream affixStream = Files.newInputStream(affFile);
|
||||||
InputStream dictStream = Files.newInputStream(dicFile);
|
InputStream dictStream = Files.newInputStream(dicFile);
|
||||||
|
|
||||||
Hunspell speller;
|
Hunspell speller;
|
||||||
Map<String, Suggester> suggesters = new LinkedHashMap<>();
|
Map<String, Suggester> suggesters = new LinkedHashMap<>();
|
||||||
try {
|
try {
|
||||||
Dictionary dictionary =
|
Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
|
||||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
|
||||||
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
Suggester suggester = new Suggester(dictionary);
|
Suggester suggester = new Suggester(dictionary);
|
||||||
suggesters.put("default", suggester);
|
suggesters.put("default", suggester);
|
||||||
|
|
Loading…
Reference in New Issue