hunspell: a couple micro-optimizations to speed up dictionary loading (#12825)

* hunspell: a couple micro-optimizations to speed up dictionary loading

1. sort by the whole entry without first searching for separators: WordStorage doesn't require strict lexicographic order (only something close to it), and the separators sort before any usual word characters anyway
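2. avoid Stream API overhead when adding an entry: replace the per-entry `group.stream().anyMatch(...)` call with an `isSuggestible` flag computed in a plain loop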
This commit is contained in:
Peter Gromov 2023-11-21 08:05:42 +01:00 committed by GitHub
parent 7eb8f6ee00
commit c23b3b3301
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 10 additions and 39 deletions

View File

@ -88,6 +88,8 @@ Optimizations
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits

View File

@ -56,6 +56,7 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@ -1087,42 +1088,7 @@ public class Dictionary {
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
tempFileNamePrefix,
new Comparator<>() {
final BytesRef scratch1 = new BytesRef();
final BytesRef scratch2 = new BytesRef();
private void initScratch(BytesRef o, BytesRef scratch) {
scratch.bytes = o.bytes;
scratch.offset = o.offset;
scratch.length = o.length;
for (int i = scratch.length - 1; i >= 0; i--) {
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
scratch.length = i;
break;
}
}
}
@Override
public int compare(BytesRef o1, BytesRef o2) {
initScratch(o1, scratch1);
initScratch(o2, scratch2);
int cmp = scratch1.compareTo(scratch2);
if (cmp == 0) {
// tie break on whole row
return o1.compareTo(o2);
} else {
return cmp;
}
}
});
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
String sorted;
boolean success = false;

View File

@ -350,16 +350,19 @@ abstract class WordStorage {
currentOrds.clear();
boolean hasNonHidden = false;
boolean isSuggestible = false;
for (char[] flags : group) {
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
hasNonHidden = true;
break;
}
if (!hasNoSuggestFlag(flags)) {
isSuggestible = true;
}
}
for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i);
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
continue;
}
@ -388,7 +391,7 @@ abstract class WordStorage {
int mask =
(prevCode == 0 ? 0 : COLLISION_MASK)
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
| (isSuggestible ? SUGGESTIBLE_MASK : 0)
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
hashTable[hash] = (mask << OFFSET_BITS) | pos;