mirror of https://github.com/apache/lucene.git
hunspell: a couple micro-optimizations to speed up dictionary loading (#12825)
* hunspell: a couple of micro-optimizations to speed up dictionary loading:
  1. Sort by the whole entry without searching for separators first: WordStorage doesn't require strict lexicographic order (only something close to it), and the separators sort before any usual word characters anyway.
  2. Avoid stream overhead when adding an entry.
This commit is contained in:
parent
7eb8f6ee00
commit
c23b3b3301
|
@ -88,6 +88,8 @@ Optimizations
|
|||
|
||||
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
||||
|
||||
* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov)
|
||||
|
||||
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
||||
|
||||
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
|
||||
|
|
|
@ -56,6 +56,7 @@ import org.apache.lucene.store.IOContext;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefComparator;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
|
@ -1087,42 +1088,7 @@ public class Dictionary {
|
|||
|
||||
private String sortWordsOffline(
|
||||
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
|
||||
OfflineSorter sorter =
|
||||
new OfflineSorter(
|
||||
tempDir,
|
||||
tempFileNamePrefix,
|
||||
new Comparator<>() {
|
||||
final BytesRef scratch1 = new BytesRef();
|
||||
final BytesRef scratch2 = new BytesRef();
|
||||
|
||||
private void initScratch(BytesRef o, BytesRef scratch) {
|
||||
scratch.bytes = o.bytes;
|
||||
scratch.offset = o.offset;
|
||||
scratch.length = o.length;
|
||||
|
||||
for (int i = scratch.length - 1; i >= 0; i--) {
|
||||
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|
||||
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
|
||||
scratch.length = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(BytesRef o1, BytesRef o2) {
|
||||
initScratch(o1, scratch1);
|
||||
initScratch(o2, scratch2);
|
||||
|
||||
int cmp = scratch1.compareTo(scratch2);
|
||||
if (cmp == 0) {
|
||||
// tie break on whole row
|
||||
return o1.compareTo(o2);
|
||||
} else {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
|
||||
|
||||
String sorted;
|
||||
boolean success = false;
|
||||
|
|
|
@ -350,16 +350,19 @@ abstract class WordStorage {
|
|||
|
||||
currentOrds.clear();
|
||||
boolean hasNonHidden = false;
|
||||
boolean isSuggestible = false;
|
||||
for (char[] flags : group) {
|
||||
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
hasNonHidden = true;
|
||||
break;
|
||||
}
|
||||
if (!hasNoSuggestFlag(flags)) {
|
||||
isSuggestible = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < group.size(); i++) {
|
||||
char[] flags = group.get(i);
|
||||
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -388,7 +391,7 @@ abstract class WordStorage {
|
|||
|
||||
int mask =
|
||||
(prevCode == 0 ? 0 : COLLISION_MASK)
|
||||
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
|
||||
| (isSuggestible ? SUGGESTIBLE_MASK : 0)
|
||||
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
||||
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
||||
|
||||
|
|
Loading…
Reference in New Issue