mirror of https://github.com/apache/lucene.git
hunspell: make the hash table load factor customizable (#12464)
* hunspell: make the hash table load factor customizable
This commit is contained in:
parent
155b2edbe3
commit
1af68bf2d7
|
@ -77,6 +77,8 @@ Improvements
|
||||||
|
|
||||||
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
|
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
|
||||||
|
|
||||||
|
* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -1133,7 +1133,8 @@ public class Dictionary {
|
||||||
Map<String, Integer> morphIndices = new HashMap<>();
|
Map<String, Integer> morphIndices = new HashMap<>();
|
||||||
|
|
||||||
WordStorage.Builder builder =
|
WordStorage.Builder builder =
|
||||||
new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());
|
new WordStorage.Builder(
|
||||||
|
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||||
|
|
||||||
try (ByteSequencesReader reader =
|
try (ByteSequencesReader reader =
|
||||||
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
|
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
|
||||||
|
@ -1204,6 +1205,16 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The factor determining the size of the internal hash table used for storing the entries. The
|
||||||
|
* table size is {@code entry_count * hashFactor}. The default factor is 1.0. If there are too
|
||||||
|
* many hash collisions, the factor can be increased, resulting in faster access, but more memory
|
||||||
|
* usage. The product is truncated to an {@code int} when the table is allocated.
|
||||||
|
*/
|
||||||
|
protected double hashFactor() {
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
char[] allNonSuggestibleFlags() {
|
char[] allNonSuggestibleFlags() {
|
||||||
return Dictionary.toSortedCharArray(
|
return Dictionary.toSortedCharArray(
|
||||||
Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
|
Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
|
||||||
|
|
|
@ -264,6 +264,7 @@ abstract class WordStorage {
|
||||||
private final List<Integer> morphDataIDs = new ArrayList<>();
|
private final List<Integer> morphDataIDs = new ArrayList<>();
|
||||||
private String currentEntry = null;
|
private String currentEntry = null;
|
||||||
private final int wordCount;
|
private final int wordCount;
|
||||||
|
private final double hashFactor;
|
||||||
private final FlagEnumerator flagEnumerator;
|
private final FlagEnumerator flagEnumerator;
|
||||||
|
|
||||||
private final ByteArrayDataOutput dataWriter;
|
private final ByteArrayDataOutput dataWriter;
|
||||||
|
@ -278,15 +279,17 @@ abstract class WordStorage {
|
||||||
*/
|
*/
|
||||||
Builder(
|
Builder(
|
||||||
int wordCount,
|
int wordCount,
|
||||||
|
double hashFactor,
|
||||||
boolean hasCustomMorphData,
|
boolean hasCustomMorphData,
|
||||||
FlagEnumerator flagEnumerator,
|
FlagEnumerator flagEnumerator,
|
||||||
char[] noSuggestFlags) {
|
char[] noSuggestFlags) {
|
||||||
this.wordCount = wordCount;
|
this.wordCount = wordCount;
|
||||||
|
this.hashFactor = hashFactor;
|
||||||
this.flagEnumerator = flagEnumerator;
|
this.flagEnumerator = flagEnumerator;
|
||||||
this.hasCustomMorphData = hasCustomMorphData;
|
this.hasCustomMorphData = hasCustomMorphData;
|
||||||
this.noSuggestFlags = noSuggestFlags;
|
this.noSuggestFlags = noSuggestFlags;
|
||||||
|
|
||||||
hashTable = new int[wordCount];
|
hashTable = new int[(int) (wordCount * hashFactor)];
|
||||||
wordData = new byte[wordCount * 6];
|
wordData = new byte[wordCount * 6];
|
||||||
|
|
||||||
dataWriter =
|
dataWriter =
|
||||||
|
@ -390,7 +393,9 @@ abstract class WordStorage {
|
||||||
|
|
||||||
if (++chainLengths[hash] > 20) {
|
if (++chainLengths[hash] > 20) {
|
||||||
throw new RuntimeException(
|
throw new RuntimeException(
|
||||||
"Too many collisions, please report this to dev@lucene.apache.org");
|
"Too many collisions. "
|
||||||
|
+ ("Try a larger Dictionary#hashFactor (now " + hashFactor + "). ")
|
||||||
|
+ "If this doesn't help, please report this to dev@lucene.apache.org");
|
||||||
}
|
}
|
||||||
|
|
||||||
// write the leaf entry for the last character
|
// write the leaf entry for the last character
|
||||||
|
|
Loading…
Reference in New Issue