diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c1980f5611c..9661d13e706 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -88,6 +88,8 @@ Optimizations * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov) +* GITHUB#12825: Hunspell: improved dictionary loading performance (Peter Gromov) + * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis) * GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index c197b16a7b7..d453404bb54 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -56,6 +56,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefComparator; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; @@ -1087,42 +1088,7 @@ public class Dictionary { private String sortWordsOffline( Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException { - OfflineSorter sorter = - new OfflineSorter( - tempDir, - tempFileNamePrefix, - new Comparator<>() { - final BytesRef scratch1 = new BytesRef(); - final BytesRef scratch2 = new BytesRef(); - - private void initScratch(BytesRef o, BytesRef scratch) { - scratch.bytes = o.bytes; - scratch.offset = o.offset; - scratch.length = o.length; - - for (int i = scratch.length - 1; i >= 0; i--) { - if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR - || scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) { - scratch.length = i; - break; - } - } - } - - @Override - public int compare(BytesRef o1, BytesRef o2) { - initScratch(o1, scratch1); - initScratch(o2, scratch2); - - int cmp = scratch1.compareTo(scratch2); - if (cmp == 0) { - // tie break on whole row - return o1.compareTo(o2); - } else { - return cmp; - } - } - }); + var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL); String sorted; boolean success = false; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 0b5e4b86978..1e739f03d47 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -350,16 +350,19 @@ abstract class WordStorage { currentOrds.clear(); boolean hasNonHidden = false; + boolean isSuggestible = false; for (char[] flags : group) { if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) { hasNonHidden = true; - break; + } + if (!hasNoSuggestFlag(flags)) { + isSuggestible = true; } } for (int i = 0; i < group.size(); i++) { char[] flags = group.get(i); - if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) { + if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) { continue; } @@ -388,7 +391,7 @@ abstract class WordStorage { int mask = (prevCode == 0 ? 0 : COLLISION_MASK) - | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0) + | (isSuggestible ? SUGGESTIBLE_MASK : 0) | Math.min(currentEntry.length(), MAX_STORED_LENGTH); hashTable[hash] = (mask << OFFSET_BITS) | pos;