From 58e3b7a8542fa8d94a77215fdf13d575a5657546 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 19 Feb 2021 20:10:06 +0100 Subject: [PATCH] LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405) --- .../lucene/analysis/hunspell/Dictionary.java | 39 ++++++++++++------- .../hunspell/TestAllDictionaries.java | 3 +- .../apache/lucene/analysis/CharArrayMap.java | 12 +++--- .../java/org/apache/lucene/util/CharsRef.java | 13 +++++-- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 7b1fdf96589..5a23d76b218 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -51,6 +51,8 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; @@ -98,6 +100,9 @@ public class Dictionary { */ FST words; + /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */ + FixedBitSet wordHashes; + /** * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish * this is 756), otherwise humans wouldn't be able to deal with it either. @@ -249,7 +254,9 @@ public class Dictionary { readAffixFile(affixStream, decoder, flagEnumerator); // read dictionary entries - IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder); + IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); + int wordCount = mergeDictionaries(dictionaries, decoder, unsorted); + wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10)); String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted); words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator); flagLookup = flagEnumerator.finish(); @@ -264,6 +271,11 @@ public class Dictionary { /** Looks up Hunspell word forms from the dictionary */ IntsRef lookupWord(char[] word, int offset, int length) { + int hash = CharsRef.stringHashCode(word, offset, length); + if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) { + return null; + } + return lookup(words, word, offset, length); } @@ -1015,15 +1027,12 @@ public class Dictionary { } } - private IndexOutput mergeDictionaries( - Directory tempDir, - String tempFileNamePrefix, - List dictionaries, - CharsetDecoder decoder) + private int mergeDictionaries( + List dictionaries, CharsetDecoder decoder, IndexOutput output) throws IOException { StringBuilder sb = new StringBuilder(); - IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); - try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { + int wordCount = 0; + try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); lines.readLine(); // first line is number of entries (approximately, sometimes) @@ -1045,16 +1054,17 @@ public class Dictionary { } } - writeNormalizedWordEntry(sb, writer, line); + wordCount += writeNormalizedWordEntry(sb, writer, line); } } - CodecUtil.writeFooter(unsorted); + CodecUtil.writeFooter(output); } - return unsorted; + return wordCount; } - private void writeNormalizedWordEntry( - StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException { + /** @return the number of word entries written */ + private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line) + throws IOException { int flagSep = line.indexOf(FLAG_SEPARATOR); int morphSep = line.indexOf(MORPH_SEPARATOR); assert morphSep > 0; @@ -1078,7 +1088,9 @@ public class Dictionary { WordCase wordCase = WordCase.caseOf(written, sep); if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) { addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep)); + return 2; } + return 1; } private void addHiddenCapitalizedWord( @@ -1221,6 +1233,7 @@ public class Dictionary { } } + wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length()); grouper.add(entry, wordForm, morphDataID); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index acef45bb4e1..6fac33d2d9f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -160,7 +160,8 @@ public class TestAllDictionaries extends LuceneTestCase { try { Dictionary dic = loadDictionary(aff); totalMemory.addAndGet(RamUsageTester.sizeOf(dic)); - totalWords.addAndGet(RamUsageTester.sizeOf(dic.words)); + totalWords.addAndGet( + RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes)); System.out.println(aff + "\t" + memoryUsageSummary(dic)); } catch (Throwable e) { failures.add(aff); diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java index ea94f96d0b0..eace7d0edb3 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.Iterator; import java.util.Map; import java.util.Set; +import org.apache.lucene.util.CharsRef; /** * A simple class that stores key Strings as char[]'s in a hash table. Note that this is not a @@ -266,20 +267,17 @@ public class CharArrayMap extends AbstractMap { private int getHashCode(char[] text, int offset, int len) { if (text == null) throw new NullPointerException(); - int code = 0; - final int stop = offset + len; if (ignoreCase) { + int stop = offset + len; + int code = 0; for (int i = offset; i < stop; ) { final int codePointAt = Character.codePointAt(text, i, stop); code = code * 31 + Character.toLowerCase(codePointAt); i += Character.charCount(codePointAt); } - } else { - for (int i = offset; i < stop; i++) { - code = code * 31 + text[i]; - } + return code; } - return code; + return CharsRef.stringHashCode(text, offset, len); } private int getHashCode(CharSequence text) { diff --git a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java index cf1f96969ea..395bd865330 100644 --- a/lucene/core/src/java/org/apache/lucene/util/CharsRef.java +++ b/lucene/core/src/java/org/apache/lucene/util/CharsRef.java @@ -74,11 +74,18 @@ public final class CharsRef implements Comparable, CharSequence, Clone @Override public int hashCode() { - final int prime = 31; + return stringHashCode(chars, offset, length); + } + + /** + * @return the hash code of the given char sub-array, calculated by {@link String#hashCode()} + * specification + */ + public static int stringHashCode(char[] chars, int offset, int length) { + int end = offset + length; int result = 0; - final int end = offset + length; for (int i = offset; i < end; i++) { - result = prime * result + chars[i]; + result = 31 * result + chars[i]; } return result; }