mirror of https://github.com/apache/lucene.git
hunspell: speed up GeneratingSuggester by not deserializing non-suggestible roots (#11859)
This commit is contained in:
parent
f3d85be476
commit
05971b3315
|
@ -60,7 +60,7 @@ Improvements
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* GITHUB#11857: Hunspell: improved suggestion performance
|
* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -49,6 +49,7 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
@ -1131,7 +1132,8 @@ public class Dictionary {
|
||||||
|
|
||||||
Map<String, Integer> morphIndices = new HashMap<>();
|
Map<String, Integer> morphIndices = new HashMap<>();
|
||||||
|
|
||||||
WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags);
|
WordStorage.Builder builder =
|
||||||
|
new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||||
|
|
||||||
try (ByteSequencesReader reader =
|
try (ByteSequencesReader reader =
|
||||||
new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
|
new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
|
||||||
|
@ -1197,6 +1199,13 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char[] allNonSuggestibleFlags() {
|
||||||
|
return Dictionary.toSortedCharArray(
|
||||||
|
Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
|
||||||
|
.filter(c -> c != FLAG_UNSET)
|
||||||
|
.collect(Collectors.toSet()));
|
||||||
|
}
|
||||||
|
|
||||||
private List<String> readMorphFields(String word, String unparsed) {
|
private List<String> readMorphFields(String word, String unparsed) {
|
||||||
List<String> morphFields = null;
|
List<String> morphFields = null;
|
||||||
for (String datum : splitMorphData(unparsed)) {
|
for (String datum : splitMorphData(unparsed)) {
|
||||||
|
|
|
@ -19,8 +19,6 @@ package org.apache.lucene.analysis.hunspell;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.HIDDEN_FLAG;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
@ -31,7 +29,6 @@ import java.util.PriorityQueue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
@ -73,19 +70,10 @@ class GeneratingSuggester {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
dictionary.words.processAllWords(
|
dictionary.words.processSuggestibleWords(
|
||||||
Math.max(1, word.length() - 4),
|
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
|
||||||
word.length() + 4,
|
word.length() + MAX_ROOT_LENGTH_DIFF,
|
||||||
(rootChars, forms) -> {
|
(rootChars, forms) -> {
|
||||||
assert rootChars.length > 0;
|
|
||||||
if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
|
|
||||||
assert rootChars.length < word.length(); // processAllWords takes care of longer keys
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int suitable = filter.findSuitableFormIndex(forms, 0);
|
|
||||||
if (suitable < 0) return;
|
|
||||||
|
|
||||||
if (ignoreTitleCaseRoots
|
if (ignoreTitleCaseRoots
|
||||||
&& Character.isUpperCase(rootChars.charAt(0))
|
&& Character.isUpperCase(rootChars.charAt(0))
|
||||||
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
|
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
|
||||||
|
@ -99,13 +87,14 @@ class GeneratingSuggester {
|
||||||
|
|
||||||
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
||||||
|
|
||||||
if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
|
if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
speller.checkCanceled.run();
|
speller.checkCanceled.run();
|
||||||
|
|
||||||
String root = rootChars.toString();
|
String root = rootChars.toString();
|
||||||
|
int suitable = filter.findSuitableFormIndex(forms, 0);
|
||||||
do {
|
do {
|
||||||
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
|
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
|
||||||
suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
|
suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
|
||||||
|
@ -126,11 +115,7 @@ class GeneratingSuggester {
|
||||||
EntryFilter(Dictionary dic) {
|
EntryFilter(Dictionary dic) {
|
||||||
formStep = dic.formStep();
|
formStep = dic.formStep();
|
||||||
flagLookup = dic.flagLookup;
|
flagLookup = dic.flagLookup;
|
||||||
|
excludeFlags = dic.allNonSuggestibleFlags();
|
||||||
Character[] flags = {HIDDEN_FLAG, dic.noSuggest, dic.forbiddenword, dic.onlyincompound};
|
|
||||||
excludeFlags =
|
|
||||||
Dictionary.toSortedCharArray(
|
|
||||||
Stream.of(flags).filter(c -> c != FLAG_UNSET).collect(Collectors.toSet()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int findSuitableFormIndex(IntsRef forms, int start) {
|
int findSuitableFormIndex(IntsRef forms, int start) {
|
||||||
|
|
|
@ -52,7 +52,8 @@ class WordStorage {
|
||||||
private static final int OFFSET_BITS = 25;
|
private static final int OFFSET_BITS = 25;
|
||||||
private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
|
private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
|
||||||
private static final int COLLISION_MASK = 0x40;
|
private static final int COLLISION_MASK = 0x40;
|
||||||
private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1;
|
private static final int SUGGESTIBLE_MASK = 0x20;
|
||||||
|
private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from word's hash (modulo array's length) into an int containing:
|
* A map from word's hash (modulo array's length) into an int containing:
|
||||||
|
@ -60,9 +61,10 @@ class WordStorage {
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
|
* <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
|
||||||
* hash
|
* hash
|
||||||
* <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating
|
* <li>the remaining highest bits: COLLISION+SUGGESTIBLE+LENGTH info for that entry, i.e. one
|
||||||
* whether there are other entries with the same hash, and the length of the entry in chars,
|
* bit indicating whether there are other entries with the same hash, one bit indicating
|
||||||
* or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
|
* whether this entry is suitable for use in suggestions, and the length of the entry in
|
||||||
|
* chars, or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
private final int[] hashTable;
|
private final int[] hashTable;
|
||||||
|
@ -77,8 +79,8 @@ class WordStorage {
|
||||||
* single-character entries
|
* single-character entries
|
||||||
* <li>(Optional, for hash-colliding entries only)
|
* <li>(Optional, for hash-colliding entries only)
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with
|
* <li>BYTE: COLLISION+SUGGESTIBLE+LENGTH info (see {@link #hashTable}) for the previous
|
||||||
* the same hash
|
* entry with the same hash
|
||||||
* <li>VINT: (delta) pointer to the previous entry
|
* <li>VINT: (delta) pointer to the previous entry
|
||||||
* </ul>
|
* </ul>
|
||||||
* <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
|
* <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
|
||||||
|
@ -140,12 +142,18 @@ class WordStorage {
|
||||||
return (mask & COLLISION_MASK) != 0;
|
return (mask & COLLISION_MASK) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean hasSuggestibleEntries(int mask) {
|
||||||
|
return (mask & SUGGESTIBLE_MASK) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls the processor for every dictionary entry with length between minLength and maxLength,
|
* Calls the processor for every dictionary entry with length between minLength and maxLength,
|
||||||
* both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can
|
* both ends inclusive, that has at least one suggestible alternative (i.e. one form carrying none
|
||||||
* be modified in any way, but may not be saved for later by the processor
|
* of the hidden, NOSUGGEST, FORBIDDENWORD, ONLYINCOMPOUND or SUBSTANDARD flags). Note that the callback arguments (word and forms) are reused, so they
|
||||||
|
* can be modified in any way, but may not be saved for later by the processor
|
||||||
*/
|
*/
|
||||||
void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
void processSuggestibleWords(
|
||||||
|
int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
||||||
assert minLength <= maxLength;
|
assert minLength <= maxLength;
|
||||||
CharsRef chars = new CharsRef(maxLength);
|
CharsRef chars = new CharsRef(maxLength);
|
||||||
IntsRef forms = new IntsRef();
|
IntsRef forms = new IntsRef();
|
||||||
|
@ -162,7 +170,8 @@ class WordStorage {
|
||||||
int prevPos = pos - in.readVInt();
|
int prevPos = pos - in.readVInt();
|
||||||
|
|
||||||
boolean last = !hasCollision(mask);
|
boolean last = !hasCollision(mask);
|
||||||
boolean mightMatch = hasLengthInRange(mask, minLength, maxLength);
|
boolean mightMatch =
|
||||||
|
hasSuggestibleEntries(mask) && hasLengthInRange(mask, minLength, maxLength);
|
||||||
|
|
||||||
if (!last) {
|
if (!last) {
|
||||||
mask = in.readByte();
|
mask = in.readByte();
|
||||||
|
@ -235,6 +244,7 @@ class WordStorage {
|
||||||
private final boolean hasCustomMorphData;
|
private final boolean hasCustomMorphData;
|
||||||
private final int[] hashTable;
|
private final int[] hashTable;
|
||||||
private byte[] wordData;
|
private byte[] wordData;
|
||||||
|
private final char[] noSuggestFlags;
|
||||||
private final int[] chainLengths;
|
private final int[] chainLengths;
|
||||||
|
|
||||||
private final IntsRefBuilder currentOrds = new IntsRefBuilder();
|
private final IntsRefBuilder currentOrds = new IntsRefBuilder();
|
||||||
|
@ -253,10 +263,15 @@ class WordStorage {
|
||||||
* pre-size the hash table. This argument can be a bit larger than the actual word count,
|
* pre-size the hash table. This argument can be a bit larger than the actual word count,
|
||||||
* but not smaller.
|
* but not smaller.
|
||||||
*/
|
*/
|
||||||
Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
|
Builder(
|
||||||
|
int wordCount,
|
||||||
|
boolean hasCustomMorphData,
|
||||||
|
FlagEnumerator flagEnumerator,
|
||||||
|
char[] noSuggestFlags) {
|
||||||
this.wordCount = wordCount;
|
this.wordCount = wordCount;
|
||||||
this.flagEnumerator = flagEnumerator;
|
this.flagEnumerator = flagEnumerator;
|
||||||
this.hasCustomMorphData = hasCustomMorphData;
|
this.hasCustomMorphData = hasCustomMorphData;
|
||||||
|
this.noSuggestFlags = noSuggestFlags;
|
||||||
|
|
||||||
hashTable = new int[wordCount];
|
hashTable = new int[wordCount];
|
||||||
wordData = new byte[wordCount * 6];
|
wordData = new byte[wordCount * 6];
|
||||||
|
@ -317,7 +332,7 @@ class WordStorage {
|
||||||
currentOrds.clear();
|
currentOrds.clear();
|
||||||
boolean hasNonHidden = false;
|
boolean hasNonHidden = false;
|
||||||
for (char[] flags : group) {
|
for (char[] flags : group) {
|
||||||
if (!hasHiddenFlag(flags)) {
|
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||||
hasNonHidden = true;
|
hasNonHidden = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -325,7 +340,7 @@ class WordStorage {
|
||||||
|
|
||||||
for (int i = 0; i < group.size(); i++) {
|
for (int i = 0; i < group.size(); i++) {
|
||||||
char[] flags = group.get(i);
|
char[] flags = group.get(i);
|
||||||
if (hasNonHidden && hasHiddenFlag(flags)) {
|
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -353,7 +368,9 @@ class WordStorage {
|
||||||
int prevCode = hashTable[hash];
|
int prevCode = hashTable[hash];
|
||||||
|
|
||||||
int mask =
|
int mask =
|
||||||
(prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
(prevCode == 0 ? 0 : COLLISION_MASK)
|
||||||
|
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
|
||||||
|
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
||||||
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
||||||
|
|
||||||
if (++chainLengths[hash] > 20) {
|
if (++chainLengths[hash] > 20) {
|
||||||
|
@ -375,9 +392,16 @@ class WordStorage {
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasHiddenFlag(char[] flags) {
|
private boolean hasNoSuggestFlag(char[] flags) {
|
||||||
for (char flag : flags) {
|
for (char flag : flags) {
|
||||||
if (flag == Dictionary.HIDDEN_FLAG) {
|
if (hasFlag(noSuggestFlags, flag)) return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean hasFlag(char[] flags, char flag) {
|
||||||
|
for (char f : flags) {
|
||||||
|
if (f == flag) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -86,11 +86,16 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testProcessSuggestibleWords() throws Exception {
|
||||||
|
Dictionary dictionary = loadDictionary("suggestible.aff", "suggestible.dic");
|
||||||
|
|
||||||
|
Set<String> processed = processSuggestibleWords(dictionary, 1, 100);
|
||||||
|
assertEquals(Set.of("normal", "ambiguous"), processed);
|
||||||
|
}
|
||||||
|
|
||||||
private void checkProcessWords(
|
private void checkProcessWords(
|
||||||
Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
|
Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
|
||||||
Set<String> processed = new HashSet<>();
|
Set<String> processed = processSuggestibleWords(dictionary, minLength, maxLength);
|
||||||
dictionary.words.processAllWords(
|
|
||||||
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
|
||||||
|
|
||||||
Set<String> filtered =
|
Set<String> filtered =
|
||||||
allWords.stream()
|
allWords.stream()
|
||||||
|
@ -100,6 +105,14 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
|
assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static Set<String> processSuggestibleWords(
|
||||||
|
Dictionary dictionary, int minLength, int maxLength) {
|
||||||
|
Set<String> processed = new HashSet<>();
|
||||||
|
dictionary.words.processSuggestibleWords(
|
||||||
|
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
||||||
|
return processed;
|
||||||
|
}
|
||||||
|
|
||||||
public void testCompressedDictionary() throws Exception {
|
public void testCompressedDictionary() throws Exception {
|
||||||
Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
|
Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
|
||||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
ONLYINCOMPOUND O
|
||||||
|
NOSUGGEST N
|
||||||
|
FORBIDDENWORD F
|
||||||
|
SUBSTANDARD S
|
|
@ -0,0 +1,8 @@
|
||||||
|
1
|
||||||
|
normal
|
||||||
|
compound/O
|
||||||
|
forbidden/F
|
||||||
|
nosuggest/N
|
||||||
|
substandard/S
|
||||||
|
ambiguous
|
||||||
|
ambiguous/N
|
Loading…
Reference in New Issue