hunspell: speed up GeneratingSuggester by not deserializing non-suggestible roots (#11859)

Peter Gromov 2022-10-19 13:17:43 +02:00 committed by GitHub
parent f3d85be476
commit 05971b3315
7 changed files with 85 additions and 42 deletions

lucene/CHANGES.txt

@@ -60,7 +60,7 @@ Improvements
 Optimizations
 ---------------------

-* GITHUB#11857: Hunspell: improved suggestion performance
+* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance

 Bug Fixes
 ---------------------

org/apache/lucene/analysis/hunspell/Dictionary.java

@@ -49,6 +49,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -1131,7 +1132,8 @@ public class Dictionary {
    Map<String, Integer> morphIndices = new HashMap<>();

-    WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags);
+    WordStorage.Builder builder =
+        new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());

    try (ByteSequencesReader reader =
        new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -1197,6 +1199,13 @@ public class Dictionary {
      }
    }

+  char[] allNonSuggestibleFlags() {
+    return Dictionary.toSortedCharArray(
+        Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
+            .filter(c -> c != FLAG_UNSET)
+            .collect(Collectors.toSet()));
+  }
+
   private List<String> readMorphFields(String word, String unparsed) {
     List<String> morphFields = null;
     for (String datum : splitMorphData(unparsed)) {

org/apache/lucene/analysis/hunspell/GeneratingSuggester.java

@@ -19,8 +19,6 @@ package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
-import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
-import static org.apache.lucene.analysis.hunspell.Dictionary.HIDDEN_FLAG;

 import java.util.ArrayList;
 import java.util.Comparator;
@@ -31,7 +29,6 @@ import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST;
@@ -73,19 +70,10 @@ class GeneratingSuggester {
          }
        };

-    dictionary.words.processAllWords(
-        Math.max(1, word.length() - 4),
-        word.length() + 4,
+    dictionary.words.processSuggestibleWords(
+        Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
+        word.length() + MAX_ROOT_LENGTH_DIFF,
        (rootChars, forms) -> {
-          assert rootChars.length > 0;
-          if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
-            assert rootChars.length < word.length(); // processAllWords takes care of longer keys
-            return;
-          }
-
-          int suitable = filter.findSuitableFormIndex(forms, 0);
-          if (suitable < 0) return;
-
          if (ignoreTitleCaseRoots
              && Character.isUpperCase(rootChars.charAt(0))
              && WordCase.caseOf(rootChars) == WordCase.TITLE) {
@@ -99,13 +87,14 @@ class GeneratingSuggester {
          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

-          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+          if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
            return;
          }

          speller.checkCanceled.run();

          String root = rootChars.toString();
+          int suitable = filter.findSuitableFormIndex(forms, 0);
          do {
            roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
            suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
@@ -126,11 +115,7 @@ class GeneratingSuggester {
    EntryFilter(Dictionary dic) {
      formStep = dic.formStep();
      flagLookup = dic.flagLookup;
-
-      Character[] flags = {HIDDEN_FLAG, dic.noSuggest, dic.forbiddenword, dic.onlyincompound};
-      excludeFlags =
-          Dictionary.toSortedCharArray(
-              Stream.of(flags).filter(c -> c != FLAG_UNSET).collect(Collectors.toSet()));
+      excludeFlags = dic.allNonSuggestibleFlags();
    }

    int findSuitableFormIndex(IntsRef forms, int start) {

org/apache/lucene/analysis/hunspell/WordStorage.java

@@ -52,7 +52,8 @@ class WordStorage {
  private static final int OFFSET_BITS = 25;
  private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
  private static final int COLLISION_MASK = 0x40;
-  private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1;
+  private static final int SUGGESTIBLE_MASK = 0x20;
+  private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;

  /**
   * A map from word's hash (modulo array's length) into an int containing:
@@ -60,9 +61,10 @@ class WordStorage {
   * <ul>
   *   <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
   *       hash
-   *   <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating
-   *       whether there are other entries with the same hash, and the length of the entry in chars,
-   *       or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
+   *   <li>the remaining highest bits: COLLISION+SUGGESTIBLE+LENGTH info for that entry, i.e. one
+   *       bit indicating whether there are other entries with the same hash, one bit indicating
+   *       whether this entry makes sense to be used in suggestions, and the length of the entry in
+   *       chars, or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
   * </ul>
   */
  private final int[] hashTable;
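
To make the bit layout described in that javadoc concrete, here is a small standalone sketch. It is not part of this commit: the class and method names are invented for illustration, and only the constant values come from the WordStorage declarations above.

```java
// Sketch of the packed hashTable cell: lower OFFSET_BITS hold the wordData offset,
// the high bits hold COLLISION + SUGGESTIBLE + stored length.
class HashCellSketch {
  static final int OFFSET_BITS = 25;
  static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
  static final int COLLISION_MASK = 0x40;
  static final int SUGGESTIBLE_MASK = 0x20;
  static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;

  // Pack an entry's wordData offset together with its COLLISION+SUGGESTIBLE+LENGTH info,
  // the same composition the Builder performs when it fills hashTable[hash].
  static int encode(int offset, boolean collision, boolean suggestible, int lengthInChars) {
    int mask =
        (collision ? COLLISION_MASK : 0)
            | (suggestible ? SUGGESTIBLE_MASK : 0)
            | Math.min(lengthInChars, MAX_STORED_LENGTH);
    return (mask << OFFSET_BITS) | offset;
  }

  // The reverse direction: what the traversal reads back from a packed cell.
  static int offset(int code) {
    return code & OFFSET_MASK;
  }

  static boolean hasCollision(int code) {
    return ((code >>> OFFSET_BITS) & COLLISION_MASK) != 0;
  }

  static boolean suggestible(int code) {
    return ((code >>> OFFSET_BITS) & SUGGESTIBLE_MASK) != 0;
  }

  static int storedLength(int code) {
    return (code >>> OFFSET_BITS) & MAX_STORED_LENGTH;
  }
}
```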
@@ -77,8 +79,8 @@ class WordStorage {
   *       single-character entries
   *   <li>(Optional, for hash-colliding entries only)
   *       <ul>
-   *         <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with
-   *             the same hash
+   *         <li>BYTE: COLLISION+SUGGESTIBLE+LENGTH info (see {@link #hashTable}) for the previous
+   *             entry with the same hash
   *         <li>VINT: (delta) pointer to the previous entry
   *       </ul>
   *   <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
@@ -140,12 +142,18 @@ class WordStorage {
    return (mask & COLLISION_MASK) != 0;
  }

+  private static boolean hasSuggestibleEntries(int mask) {
+    return (mask & SUGGESTIBLE_MASK) != 0;
+  }
+
  /**
   * Calls the processor for every dictionary entry with length between minLength and maxLength,
-   * both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can
-   * be modified in any way, but may not be saved for later by the processor
+   * both ends inclusive, and at least one suggestible alternative (without NOSUGGEST, FORBIDDENWORD
+   * or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they
+   * can be modified in any way, but may not be saved for later by the processor
   */
-  void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
+  void processSuggestibleWords(
+      int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
    assert minLength <= maxLength;
    CharsRef chars = new CharsRef(maxLength);
    IntsRef forms = new IntsRef();
@@ -162,7 +170,8 @@ class WordStorage {
        int prevPos = pos - in.readVInt();
        boolean last = !hasCollision(mask);
-        boolean mightMatch = hasLengthInRange(mask, minLength, maxLength);
+        boolean mightMatch =
+            hasSuggestibleEntries(mask) && hasLengthInRange(mask, minLength, maxLength);

        if (!last) {
          mask = in.readByte();
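
As a usage note on the reused-arguments caveat in the javadoc above, a caller has to materialize the CharsRef before keeping it. A minimal hypothetical caller, essentially the shape of the test helper added later in this commit (`storage` stands for a WordStorage instance; WordStorage is package-private, so this only sketches the call shape):

```java
import java.util.HashSet;
import java.util.Set;

class SuggestibleRootCollector {
  // Collect all suggestible words with length 1..100 from the given storage.
  static Set<String> collect(WordStorage storage) {
    Set<String> roots = new HashSet<>();
    // The CharsRef passed to the callback is reused between invocations,
    // so it is copied via toString() before being stored.
    storage.processSuggestibleWords(1, 100, (word, forms) -> roots.add(word.toString()));
    return roots;
  }
}
```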
@@ -235,6 +244,7 @@ class WordStorage {
    private final boolean hasCustomMorphData;
    private final int[] hashTable;
    private byte[] wordData;
+    private final char[] noSuggestFlags;
    private final int[] chainLengths;

    private final IntsRefBuilder currentOrds = new IntsRefBuilder();
@@ -253,10 +263,15 @@ class WordStorage {
     *     pre-size the hash table. This argument can be a bit larger than the actual word count,
     *     but not smaller.
     */
-    Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
+    Builder(
+        int wordCount,
+        boolean hasCustomMorphData,
+        FlagEnumerator flagEnumerator,
+        char[] noSuggestFlags) {
      this.wordCount = wordCount;
      this.flagEnumerator = flagEnumerator;
      this.hasCustomMorphData = hasCustomMorphData;
+      this.noSuggestFlags = noSuggestFlags;

      hashTable = new int[wordCount];
      wordData = new byte[wordCount * 6];
@@ -317,7 +332,7 @@ class WordStorage {
      currentOrds.clear();
      boolean hasNonHidden = false;
      for (char[] flags : group) {
-        if (!hasHiddenFlag(flags)) {
+        if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
          hasNonHidden = true;
          break;
        }
@@ -325,7 +340,7 @@ class WordStorage {
      for (int i = 0; i < group.size(); i++) {
        char[] flags = group.get(i);
-        if (hasNonHidden && hasHiddenFlag(flags)) {
+        if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
          continue;
        }
@@ -353,7 +368,9 @@ class WordStorage {
      int prevCode = hashTable[hash];

      int mask =
-          (prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
+          (prevCode == 0 ? 0 : COLLISION_MASK)
+              | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
+              | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
      hashTable[hash] = (mask << OFFSET_BITS) | pos;

      if (++chainLengths[hash] > 20) {
@@ -375,9 +392,16 @@ class WordStorage {
      return pos;
    }

-    private static boolean hasHiddenFlag(char[] flags) {
+    private boolean hasNoSuggestFlag(char[] flags) {
      for (char flag : flags) {
-        if (flag == Dictionary.HIDDEN_FLAG) {
+        if (hasFlag(noSuggestFlags, flag)) return true;
+      }
+      return false;
+    }
+
+    private static boolean hasFlag(char[] flags, char flag) {
+      for (char f : flags) {
+        if (f == flag) {
          return true;
        }
      }

org/apache/lucene/analysis/hunspell/TestDictionary.java

@@ -86,11 +86,16 @@ public class TestDictionary extends LuceneTestCase {
    }
  }

+  public void testProcessSuggestibleWords() throws Exception {
+    Dictionary dictionary = loadDictionary("suggestible.aff", "suggestible.dic");
+    Set<String> processed = processSuggestibleWords(dictionary, 1, 100);
+    assertEquals(Set.of("normal", "ambiguous"), processed);
+  }
+
  private void checkProcessWords(
      Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
-    Set<String> processed = new HashSet<>();
-    dictionary.words.processAllWords(
-        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    Set<String> processed = processSuggestibleWords(dictionary, minLength, maxLength);

    Set<String> filtered =
        allWords.stream()
@@ -100,6 +105,14 @@ public class TestDictionary extends LuceneTestCase {
    assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
  }

+  private static Set<String> processSuggestibleWords(
+      Dictionary dictionary, int minLength, int maxLength) {
+    Set<String> processed = new HashSet<>();
+    dictionary.words.processSuggestibleWords(
+        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    return processed;
+  }
+
  public void testCompressedDictionary() throws Exception {
    Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
    assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);

suggestible.aff (new test resource)

@@ -0,0 +1,4 @@
+ONLYINCOMPOUND O
+NOSUGGEST N
+FORBIDDENWORD F
+SUBSTANDARD S

suggestible.dic (new test resource)

@@ -0,0 +1,8 @@
+1
+normal
+compound/O
+forbidden/F
+nosuggest/N
+substandard/S
+ambiguous
+ambiguous/N
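
For context, a rough end-to-end sketch (not part of this commit) of the code path this change speeds up: loading a dictionary and asking the Hunspell facade for suggestions, which internally runs GeneratingSuggester over the suggestible roots. The file names, temp-directory choice, and misspelled input are illustrative assumptions.

```java
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.store.ByteBuffersDirectory;

public class SuggestSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical .aff/.dic pair on disk; any Hunspell dictionary would do.
    try (InputStream aff = Files.newInputStream(Path.of("en_US.aff"));
        InputStream dic = Files.newInputStream(Path.of("en_US.dic"))) {
      Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "hunspell", aff, dic);
      Hunspell hunspell = new Hunspell(dictionary);
      // GeneratingSuggester runs under the hood here; with this commit it skips
      // roots whose every alternative carries a non-suggestible flag.
      List<String> suggestions = hunspell.suggest("recieve");
      System.out.println(suggestions);
    }
  }
}
```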