hunspell: make the hash table load factor customizable (#12464)

* hunspell: make the hash table load factor customizable
This commit is contained in:
Peter Gromov 2023-07-28 18:36:32 +02:00 committed by GitHub
parent 155b2edbe3
commit 1af68bf2d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 3 deletions

View File

@ -77,6 +77,8 @@ Improvements
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)
Optimizations
---------------------

View File

@ -1133,7 +1133,8 @@ public class Dictionary {
Map<String, Integer> morphIndices = new HashMap<>();
WordStorage.Builder builder =
new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());
new WordStorage.Builder(
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
try (ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
@ -1204,6 +1205,16 @@ public class Dictionary {
}
}
/**
* Returns the factor that determines the size of the internal hash table used for storing the
* entries. The table is allocated with {@code (int) (entry_count * hashFactor)} slots, so the
* product is truncated to an integer. The default factor is 1.0.
*
* <p>If there are too many hash collisions (building the word storage fails with a "Too many
* collisions" error once a collision chain grows past its limit), the factor can be increased by
* overriding this method, resulting in faster access, but more memory usage.
*
* @return the multiplier applied to the entry count to obtain the hash table size; 1.0 unless
*     overridden
*/
protected double hashFactor() {
return 1.0;
}
char[] allNonSuggestibleFlags() {
return Dictionary.toSortedCharArray(
Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)

View File

@ -264,6 +264,7 @@ abstract class WordStorage {
private final List<Integer> morphDataIDs = new ArrayList<>();
private String currentEntry = null;
private final int wordCount;
private final double hashFactor;
private final FlagEnumerator flagEnumerator;
private final ByteArrayDataOutput dataWriter;
@ -278,15 +279,17 @@ abstract class WordStorage {
*/
Builder(
int wordCount,
double hashFactor,
boolean hasCustomMorphData,
FlagEnumerator flagEnumerator,
char[] noSuggestFlags) {
this.wordCount = wordCount;
this.hashFactor = hashFactor;
this.flagEnumerator = flagEnumerator;
this.hasCustomMorphData = hasCustomMorphData;
this.noSuggestFlags = noSuggestFlags;
hashTable = new int[wordCount];
hashTable = new int[(int) (wordCount * hashFactor)];
wordData = new byte[wordCount * 6];
dataWriter =
@ -390,7 +393,9 @@ abstract class WordStorage {
if (++chainLengths[hash] > 20) {
throw new RuntimeException(
"Too many collisions, please report this to dev@lucene.apache.org");
"Too many collisions. "
+ ("Try a larger Dictionary#hashFactor (now " + hashFactor + "). ")
+ "If this doesn't help, please report this to dev@lucene.apache.org");
}
// write the leaf entry for the last character