mirror of https://github.com/apache/lucene.git
LUCENE-9790: Hunspell: avoid slow dictionary lookup if the word's hash isn't there (#2405)
This commit is contained in:
parent 4b3fb1e065
commit 58e3b7a854
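
In short: while reading the .dic files, the dictionary now sets one bit per word in a FixedBitSet sized at roughly ten bits per entry (rounded down to a power of two). lookupWord first hashes the query with the new CharsRef.stringHashCode and probes that bit, and only pays for the FST traversal when the bit is set. stringHashCode is extracted from CharsRef.hashCode so that a String.hashCode-compatible value can be computed over a char[] range without allocating, and CharArrayMap's case-sensitive path is rewritten to reuse it.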

Dictionary.java

@@ -51,6 +51,8 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;

@@ -98,6 +100,9 @@ public class Dictionary {
    */
   FST<IntsRef> words;
 
+  /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
+  FixedBitSet wordHashes;
+
   /**
    * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish
    * this is 756), otherwise humans wouldn't be able to deal with it either.

@@ -249,7 +254,9 @@ public class Dictionary {
     readAffixFile(affixStream, decoder, flagEnumerator);
 
     // read dictionary entries
-    IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
+    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
+    int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
+    wordHashes = new FixedBitSet(Integer.highestOneBit(wordCount * 10));
     String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
     words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
     flagLookup = flagEnumerator.finish();
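
The sizing line above allocates the largest power of two not exceeding ten bits per dictionary word, so the filter ends up with between five and ten bits per entry. Since each word sets exactly one bit, a missing word slips past the filter with probability roughly 1 - e^(-n/m). A back-of-the-envelope check (the word count below is invented for illustration):

    public class BitsPerWordEstimate {
      public static void main(String[] args) {
        int wordCount = 300_000; // hypothetical dictionary size, not from any real .dic file
        // Largest power of two <= 10 * wordCount, as in the constructor above
        int bits = Integer.highestOneBit(wordCount * 10); // 2_097_152 here
        // Single-hash Bloom filter: a missing word passes with probability ~ 1 - e^(-n/m)
        double fpp = 1 - Math.exp(-(double) wordCount / bits);
        System.out.printf("%.1f bits/word, ~%.0f%% of misses still reach the FST%n",
            (double) bits / wordCount, fpp * 100);
      }
    }

For these numbers that is about 7 bits per word and a ~13% false-positive rate; the other ~87% of lookups for absent words return after one hash and a single bit test.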

@@ -264,6 +271,11 @@ public class Dictionary {
 
   /** Looks up Hunspell word forms from the dictionary */
   IntsRef lookupWord(char[] word, int offset, int length) {
+    int hash = CharsRef.stringHashCode(word, offset, length);
+    if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) {
+      return null;
+    }
+
     return lookup(words, word, offset, length);
   }
 
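
Taken together, the Dictionary hunks implement a one-hash Bloom filter in front of the FST. Here is a minimal standalone sketch of the same idea; none of these names are Lucene API, java.util.BitSet stands in for FixedBitSet, and the sign bit is masked instead of calling Math.abs (sidestepping the Math.abs(Integer.MIN_VALUE) corner case, whereas the commit applies the same Math.abs expression on both the write and read paths and so stays self-consistent):

    import java.util.BitSet;

    /** Illustrative single-hash Bloom filter guarding an expensive lookup. */
    class WordHashPrefilter {
      private final BitSet bits;
      private final int numBits;

      WordHashPrefilter(int expectedWords) {
        // ~5-10 bits per word, rounded down to a power of two, as in the commit
        // (assumes expectedWords is small enough that * 10 doesn't overflow)
        this.numBits = Math.max(1, Integer.highestOneBit(expectedWords * 10));
        this.bits = new BitSet(numBits);
      }

      /** Called once per entry at dictionary-build time. */
      void add(String word) {
        bits.set(bucket(word.hashCode()));
      }

      /** false = definitely absent; true = caller must do the real, slow lookup. */
      boolean mightContain(char[] word, int offset, int length) {
        int hash = 0;
        for (int i = offset; i < offset + length; i++) {
          hash = 31 * hash + word[i]; // must match String.hashCode() exactly
        }
        return bits.get(bucket(hash));
      }

      private int bucket(int hash) {
        return (hash & 0x7fffffff) % numBits; // mask keeps the index non-negative
      }
    }

The essential invariant is that the build-time hash (String.hashCode() of the stored entry) and the query-time hash (computed over a char[] range without allocating) are the same function; that is exactly what the CharsRef change further down guarantees.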

@@ -1015,15 +1027,12 @@ public class Dictionary {
     }
   }
 
-  private IndexOutput mergeDictionaries(
-      Directory tempDir,
-      String tempFileNamePrefix,
-      List<InputStream> dictionaries,
-      CharsetDecoder decoder)
+  private int mergeDictionaries(
+      List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
       throws IOException {
     StringBuilder sb = new StringBuilder();
-    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
-    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
+    int wordCount = 0;
+    try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
       for (InputStream dictionary : dictionaries) {
         BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
         lines.readLine(); // first line is number of entries (approximately, sometimes)

@@ -1045,16 +1054,17 @@ public class Dictionary {
             }
           }
 
-          writeNormalizedWordEntry(sb, writer, line);
+          wordCount += writeNormalizedWordEntry(sb, writer, line);
         }
       }
-      CodecUtil.writeFooter(unsorted);
+      CodecUtil.writeFooter(output);
     }
-    return unsorted;
+    return wordCount;
   }
 
-  private void writeNormalizedWordEntry(
-      StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
+  /** @return the number of word entries written */
+  private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
+      throws IOException {
     int flagSep = line.indexOf(FLAG_SEPARATOR);
     int morphSep = line.indexOf(MORPH_SEPARATOR);
     assert morphSep > 0;

@@ -1078,7 +1088,9 @@ public class Dictionary {
     WordCase wordCase = WordCase.caseOf(written, sep);
     if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
       addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+      return 2;
     }
+    return 1;
   }
 
   private void addHiddenCapitalizedWord(
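
The new return values matter for the filter sizing: writeNormalizedWordEntry now reports how many entries it wrote (2 when a hidden lowercase variant accompanies a MIXED- or UPPER-case word, otherwise 1), so wordCount counts every entry that can later be probed, hidden ones included.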

@@ -1221,6 +1233,7 @@ public class Dictionary {
       }
     }
 
+    wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
     grouper.add(entry, wordForm, morphDataID);
   }
 
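
Here the bit is set with entry.hashCode(), where entry is the String read back from the sorted temp file, while lookupWord probes with CharsRef.stringHashCode over a char[] range. The two agree because stringHashCode follows the String.hashCode() specification (see the CharsRef hunk below), and both sides reduce the hash with the same Math.abs(...) % wordHashes.length() expression.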

TestAllDictionaries.java

@@ -160,7 +160,8 @@ public class TestAllDictionaries extends LuceneTestCase {
       try {
         Dictionary dic = loadDictionary(aff);
         totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
-        totalWords.addAndGet(RamUsageTester.sizeOf(dic.words));
+        totalWords.addAndGet(
+            RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
         System.out.println(aff + "\t" + memoryUsageSummary(dic));
       } catch (Throwable e) {
         failures.add(aff);

CharArrayMap.java

@@ -22,6 +22,7 @@ import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
+import org.apache.lucene.util.CharsRef;
 
 /**
  * A simple class that stores key Strings as char[]'s in a hash table. Note that this is not a

@@ -266,20 +267,17 @@ public class CharArrayMap<V> extends AbstractMap<Object, V> {
 
   private int getHashCode(char[] text, int offset, int len) {
     if (text == null) throw new NullPointerException();
-    int code = 0;
-    final int stop = offset + len;
     if (ignoreCase) {
+      int stop = offset + len;
+      int code = 0;
       for (int i = offset; i < stop; ) {
         final int codePointAt = Character.codePointAt(text, i, stop);
         code = code * 31 + Character.toLowerCase(codePointAt);
         i += Character.charCount(codePointAt);
       }
-    } else {
-      for (int i = offset; i < stop; i++) {
-        code = code * 31 + text[i];
-      }
+      return code;
     }
-    return code;
+    return CharsRef.stringHashCode(text, offset, len);
   }
 
   private int getHashCode(CharSequence text) {
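
Behavior is unchanged here: the deleted else-branch computed exactly the 31-based String.hashCode() formula, so the case-sensitive path can delegate to the shared CharsRef.stringHashCode helper and CharArrayMap keeps only its case-folding loop.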

CharsRef.java

@@ -74,11 +74,18 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence, Cloneable {
 
   @Override
   public int hashCode() {
-    final int prime = 31;
+    return stringHashCode(chars, offset, length);
+  }
+
+  /**
+   * @return the hash code of the given char sub-array, calculated by {@link String#hashCode()}
+   *     specification
+   */
+  public static int stringHashCode(char[] chars, int offset, int length) {
+    int end = offset + length;
     int result = 0;
-    final int end = offset + length;
     for (int i = offset; i < end; i++) {
-      result = prime * result + chars[i];
+      result = 31 * result + chars[i];
     }
     return result;
   }
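
A quick check of the contract the extracted helper keeps (a hypothetical snippet, not part of the commit):

    import org.apache.lucene.util.CharsRef;

    public class StringHashCodeCheck {
      public static void main(String[] args) {
        String word = "hello"; // arbitrary sample word
        char[] padded = ("xx" + word + "yy").toCharArray();
        // stringHashCode over the sub-range must equal String.hashCode() of the
        // same characters, per the javadoc above
        int subRangeHash = CharsRef.stringHashCode(padded, 2, word.length());
        System.out.println(subRangeHash == word.hashCode()); // prints: true
      }
    }

Because the value matches String.hashCode(), bits recorded from Strings at build time can be probed from char[] slices at query time without ever materializing a String.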