From 1af68bf2d7ee73b72309b7a6fcf5361051087ef0 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 28 Jul 2023 18:36:32 +0200 Subject: [PATCH] hunspell: make the hash table load factor customizable (#12464) * hunspell: make the hash table load factor customizable --- lucene/CHANGES.txt | 2 ++ .../apache/lucene/analysis/hunspell/Dictionary.java | 13 ++++++++++++- .../lucene/analysis/hunspell/WordStorage.java | 9 +++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e4b81d05ca0..7f31158c3f5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -77,6 +77,8 @@ Improvements * GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov) +* GITHUB#12464: Hunspell: allow customizing the hash table load factor (Peter Gromov) + Optimizations --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 50c11159a0d..ae2c7b05439 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1133,7 +1133,8 @@ public class Dictionary { Map morphIndices = new HashMap<>(); WordStorage.Builder builder = - new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags()); + new WordStorage.Builder( + wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags()); try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) { @@ -1204,6 +1205,16 @@ public class Dictionary { } } + /** + * The factor determining the size of the internal hash table used for storing the entries. The + * table size is {@code entry_count * hashFactor}. The default factor is 1.0. If there are too + * many hash collisions, the factor can be increased, resulting in faster access, but more memory + * usage. + */ + protected double hashFactor() { + return 1.0; + } + char[] allNonSuggestibleFlags() { return Dictionary.toSortedCharArray( Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 4dcbca3135e..82cf3153ace 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -264,6 +264,7 @@ abstract class WordStorage { private final List morphDataIDs = new ArrayList<>(); private String currentEntry = null; private final int wordCount; + private final double hashFactor; private final FlagEnumerator flagEnumerator; private final ByteArrayDataOutput dataWriter; @@ -278,15 +279,17 @@ abstract class WordStorage { */ Builder( int wordCount, + double hashFactor, boolean hasCustomMorphData, FlagEnumerator flagEnumerator, char[] noSuggestFlags) { this.wordCount = wordCount; + this.hashFactor = hashFactor; this.flagEnumerator = flagEnumerator; this.hasCustomMorphData = hasCustomMorphData; this.noSuggestFlags = noSuggestFlags; - hashTable = new int[wordCount]; + hashTable = new int[(int) (wordCount * hashFactor)]; wordData = new byte[wordCount * 6]; dataWriter = @@ -390,7 +393,9 @@ abstract class WordStorage { if (++chainLengths[hash] > 20) { throw new RuntimeException( - "Too many collisions, please report this to dev@lucene.apache.org"); + "Too many collisions. " + + ("Try a larger Dictionary#hashFactor (now " + hashFactor + "). ") + + "If this doesn't help, please report this to dev@lucene.apache.org"); } // write the leaf entry for the last character