hunspell: speed up the dictionary enumeration (#12447)

* hunspell: speed up the dictionary enumeration

cache each word's case and the lowercase form
group the words by lengths to avoid even visiting entries with unneeded lengths
This commit is contained in:
Peter Gromov 2023-07-18 21:25:26 +02:00 committed by GitHub
parent b4619d87ed
commit f05adff4ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 265 additions and 118 deletions

View File

@ -75,6 +75,8 @@ Improvements
* LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
Optimizations
---------------------

View File

@ -1189,7 +1189,12 @@ public class Dictionary {
// finalize last entry
success = true;
return builder.build();
return new WordStorage(builder) {
@Override
char caseFold(char c) {
return Dictionary.this.caseFold(c);
}
};
} finally {
if (success) {
tempDir.deleteFile(sorted);

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
/**
 * A mutable entry object used when enumerating the dictionary internally.
 *
 * <p>Instances are flyweights: the same object (and the refs it returns) is reused for every entry
 * during an enumeration, so callers must copy any data they need to keep, and must not hold a
 * reference to this object or its return values after the callback returns.
 */
abstract class FlyweightEntry {
/**
 * Whether the root word is in Title case. Used to skip title-case roots when suggesting
 * replacements for an all-lowercase misspelling (except for languages like German where
 * title-case nouns are common).
 */
abstract boolean hasTitleCase();
/** The root word in its original case. The returned ref is reused; copy it if needed later. */
abstract CharsRef root();
/**
 * The case-folded (lowercase) form of the root, suitable for case-insensitive matching, e.g.
 * n-gram scoring against a lowercased misspelled word.
 */
abstract CharSequence lowerCaseRoot();
/**
 * The form ids associated with this root (indices into the dictionary's flag data). The returned
 * ref is reused; copy it if needed later.
 */
abstract IntsRef forms();
}

View File

@ -28,9 +28,8 @@ import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.IntPredicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@ -72,29 +71,22 @@ class GeneratingSuggester {
IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);
boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
TrigramAutomaton automaton =
new TrigramAutomaton(word) {
@Override
char transformChar(char c) {
return dictionary.caseFold(c);
}
};
TrigramAutomaton automaton = new TrigramAutomaton(word);
processSuggestibleWords(
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
word.length() + MAX_ROOT_LENGTH_DIFF,
(rootChars, formSupplier) -> {
if (ignoreTitleCaseRoots
&& Character.isUpperCase(rootChars.charAt(0))
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
(entry) -> {
if (ignoreTitleCaseRoots && entry.hasTitleCase()) {
return;
}
int sc = automaton.ngramScore(rootChars);
int sc = automaton.ngramScore(entry.lowerCaseRoot());
if (sc == 0) {
return; // no common characters at all, don't suggest this root
}
CharsRef rootChars = entry.root();
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
boolean overflow = roots.size() == MAX_ROOTS;
@ -105,7 +97,7 @@ class GeneratingSuggester {
speller.checkCanceled.run();
String root = rootChars.toString();
IntsRef forms = formSupplier.get();
IntsRef forms = entry.forms();
for (int i = 0; i < forms.length; i++) {
if (isSuggestible.test(forms.ints[forms.offset + i])) {
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
@ -125,7 +117,7 @@ class GeneratingSuggester {
}
private void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
if (entryCache != null) {
entryCache.processSuggestibleWords(minLength, maxLength, processor);
} else {

View File

@ -16,8 +16,9 @@
*/
package org.apache.lucene.analysis.hunspell;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@ -28,74 +29,149 @@ import org.apache.lucene.util.IntsRef;
* compression.
*/
class SuggestibleEntryCache {
private final short[] lengths;
private final char[] roots;
private final int[] formData;
private static final short LOWER_CASE = (short) WordCase.LOWER.ordinal();
private static final short NEUTRAL_CASE = (short) WordCase.NEUTRAL.ordinal();
private static final short TITLE_CASE = (short) WordCase.TITLE.ordinal();
private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
this.lengths = lengths;
this.roots = roots;
this.formData = formData;
private final Section[] sections;
private SuggestibleEntryCache(Map<Integer, SectionBuilder> builders) {
int maxLength =
builders.isEmpty() ? 0 : builders.keySet().stream().max(Integer::compare).orElseThrow();
sections = new Section[maxLength + 1];
for (int i = 0; i < sections.length; i++) {
SectionBuilder builder = builders.get(i);
sections[i] = builder == null ? null : builder.build(i);
}
}
static SuggestibleEntryCache buildCache(WordStorage storage) {
var consumer =
new BiConsumer<CharsRef, Supplier<IntsRef>>() {
short[] lengths = new short[10];
final StringBuilder roots = new StringBuilder();
int[] formData = new int[10];
int lenOffset = 0;
int formDataOffset = 0;
new Consumer<FlyweightEntry>() {
final Map<Integer, SectionBuilder> builders = new HashMap<>();
@Override
public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
public void accept(FlyweightEntry entry) {
CharsRef root = entry.root();
if (root.length > Short.MAX_VALUE) {
throw new UnsupportedOperationException(
"Too long dictionary entry, please report this to dev@lucene.apache.org");
}
IntsRef forms = formSupplier.get();
lengths = ArrayUtil.grow(lengths, lenOffset + 2);
lengths[lenOffset] = (short) root.length;
lengths[lenOffset + 1] = (short) forms.length;
lenOffset += 2;
roots.append(root.chars, root.offset, root.length);
formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
formDataOffset += forms.length;
builders.computeIfAbsent(root.length, __ -> new SectionBuilder()).add(entry);
}
};
storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);
return new SuggestibleEntryCache(
ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
consumer.roots.toString().toCharArray(),
ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
return new SuggestibleEntryCache(consumer.builders);
}
void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
CharsRef chars = new CharsRef(roots, 0, 0);
IntsRef forms = new IntsRef(formData, 0, 0);
Supplier<IntsRef> formSupplier = () -> forms;
int rootOffset = 0;
int formDataOffset = 0;
for (int i = 0; i < lengths.length; i += 2) {
int rootLength = lengths[i];
short formDataLength = lengths[i + 1];
if (rootLength >= minLength && rootLength <= maxLength) {
chars.offset = rootOffset;
chars.length = rootLength;
forms.offset = formDataOffset;
forms.length = formDataLength;
processor.accept(chars, formSupplier);
private static class SectionBuilder {
final StringBuilder roots = new StringBuilder(), lowRoots = new StringBuilder();
short[] meta = new short[10];
int[] formData = new int[10];
int metaOffset, formDataOffset;
void add(FlyweightEntry entry) {
CharsRef root = entry.root();
if (root.length > Short.MAX_VALUE) {
throw new UnsupportedOperationException(
"Too long dictionary entry, please report this to dev@lucene.apache.org");
}
IntsRef forms = entry.forms();
short rootCase = (short) WordCase.caseOf(root).ordinal();
meta = ArrayUtil.grow(meta, metaOffset + 2);
meta[metaOffset] = (short) forms.length;
meta[metaOffset + 1] = rootCase;
metaOffset += 2;
lowRoots.append(entry.lowerCaseRoot());
if (hasUpperCase(rootCase)) {
roots.append(root.chars, root.offset, root.length);
}
formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
formDataOffset += forms.length;
}
Section build(int rootLength) {
return new Section(
rootLength,
ArrayUtil.copyOfSubArray(meta, 0, metaOffset),
roots.toString().toCharArray(),
lowRoots.toString().toCharArray(),
ArrayUtil.copyOfSubArray(formData, 0, formDataOffset));
}
}
private static boolean hasUpperCase(short rootCase) {
return rootCase != LOWER_CASE && rootCase != NEUTRAL_CASE;
}
void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
maxLength = Math.min(maxLength, sections.length - 1);
for (int i = Math.min(minLength, sections.length); i <= maxLength; i++) {
Section section = sections[i];
if (section != null) {
section.processWords(processor);
}
}
}
/**
* @param meta The lengths of the entry sub-arrays in formData plus the case information
* @param roots original roots if they're not all-lowercase
*/
private record Section(
int rootLength, short[] meta, char[] roots, char[] lowRoots, int[] formData) {
void processWords(Consumer<FlyweightEntry> processor) {
CharsRef chars = new CharsRef(roots, 0, Math.min(rootLength, roots.length));
CharsRef lowerChars = new CharsRef(lowRoots, 0, rootLength);
IntsRef forms = new IntsRef(formData, 0, 0);
var entry =
new FlyweightEntry() {
short wordCase;
@Override
CharsRef root() {
return hasUpperCase(wordCase) ? chars : lowerChars;
}
@Override
boolean hasTitleCase() {
return wordCase == TITLE_CASE;
}
@Override
CharSequence lowerCaseRoot() {
return lowerChars;
}
@Override
IntsRef forms() {
return forms;
}
};
for (int i = 0; i < meta.length; i += 2) {
short formDataLength = meta[i];
short wordCase = meta[i + 1];
forms.length = formDataLength;
entry.wordCase = wordCase;
processor.accept(entry);
lowerChars.offset += rootLength;
if (hasUpperCase(wordCase)) {
chars.offset += rootLength;
}
forms.offset += formDataLength;
}
rootOffset += rootLength;
formDataOffset += formDataLength;
}
}
}

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@ -78,7 +77,7 @@ class TrigramAutomaton {
return state;
}
int ngramScore(CharsRef s2) {
int ngramScore(CharSequence s2) {
countedSubstrings.clear();
int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
@ -86,9 +85,9 @@ class TrigramAutomaton {
// states of running the automaton on substrings [i-1, i) and [i-2, i)
int state1 = -1, state2 = -1;
int limit = s2.length + s2.offset;
for (int i = s2.offset; i < limit; i++) {
char c = transformChar(s2.chars[i]);
int limit = s2.length();
for (int i = 0; i < limit; i++) {
char c = s2.charAt(i);
if (c < minChar) {
state1 = state2 = -1;
continue;
@ -121,10 +120,6 @@ class TrigramAutomaton {
return score;
}
char transformChar(char c) {
return c;
}
private int substringScore(int state, FixedBitSet countedSubstrings) {
if (countedSubstrings.getAndSet(state)) return 0;

View File

@ -300,9 +300,9 @@ public class WordFormGenerator {
1,
Integer.MAX_VALUE,
false,
(root, lazyForms) -> {
String rootStr = root.toString();
IntsRef forms = lazyForms.get();
e -> {
String rootStr = e.root().toString();
IntsRef forms = e.forms();
for (int i = 0; i < forms.length; i += dictionary.formStep()) {
char[] encodedFlags = dictionary.flagLookup.getFlags(forms.ints[forms.offset + i]);
if (shouldConsiderAtAll(encodedFlags)) {

View File

@ -19,8 +19,7 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import java.util.function.Consumer;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;
@ -49,7 +48,7 @@ import org.apache.lucene.util.fst.IntSequenceOutputs;
* The entries are stored in a contiguous byte array, identified by their offsets, using {@link
 * DataOutput#writeVInt} (VINT) format for compression.
*/
class WordStorage {
abstract class WordStorage {
private static final int OFFSET_BITS = 25;
private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
private static final int COLLISION_MASK = 0x40;
@ -91,12 +90,15 @@ class WordStorage {
*/
private final byte[] wordData;
private WordStorage(
int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
this.maxEntryLength = maxEntryLength;
this.hasCustomMorphData = hasCustomMorphData;
this.hashTable = hashTable;
this.wordData = wordData;
WordStorage(Builder builder) throws IOException {
if (builder.hashTable.length > 0) {
assert !builder.group.isEmpty() : "WordStorage builder should be only used once";
builder.flushGroup();
}
this.maxEntryLength = builder.maxEntryLength;
this.hasCustomMorphData = builder.hasCustomMorphData;
this.hashTable = builder.hashTable.length == 0 ? new int[1] : builder.hashTable;
this.wordData = ArrayUtil.copyOfSubArray(builder.wordData, 0, builder.dataWriter.getPosition());
}
IntsRef lookupWord(char[] word, int offset, int length) {
@ -157,22 +159,20 @@ class WordStorage {
* or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they
* can be modified in any way, but may not be saved for later by the processor
*/
void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
processAllWords(minLength, maxLength, true, processor);
}
void processAllWords(
int minLength,
int maxLength,
boolean suggestibleOnly,
BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
int minLength, int maxLength, boolean suggestibleOnly, Consumer<FlyweightEntry> processor) {
assert minLength <= maxLength;
maxLength = Math.min(maxEntryLength, maxLength);
CharsRef chars = new CharsRef(maxLength);
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
var formSupplier = new LazyFormReader(in);
var entry = new MyFlyweightEntry(chars, in);
for (int entryCode : hashTable) {
int pos = entryCode & OFFSET_MASK;
int mask = entryCode >>> OFFSET_BITS;
@ -195,7 +195,7 @@ class WordStorage {
}
if (mightMatch) {
formSupplier.dataPos = in.getPosition();
entry.dataPos = in.getPosition();
while (prevPos != 0 && wordStart > 0) {
in.setPosition(prevPos);
chars.chars[--wordStart] = (char) in.readVInt();
@ -205,7 +205,7 @@ class WordStorage {
if (prevPos == 0) {
chars.offset = wordStart;
chars.length = maxLength - wordStart;
processor.accept(chars, formSupplier);
processor.accept(entry);
}
}
@ -422,30 +422,61 @@ class WordStorage {
}
return false;
}
WordStorage build() throws IOException {
if (hashTable.length > 0) {
assert !group.isEmpty() : "build() should be only called once";
flushGroup();
}
byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
int[] table = hashTable.length == 0 ? new int[1] : hashTable;
return new WordStorage(maxEntryLength, hasCustomMorphData, table, trimmedData);
}
}
private class LazyFormReader implements Supplier<IntsRef> {
int dataPos;
private final ByteArrayDataInput in;
private final IntsRef forms;
abstract char caseFold(char c);
LazyFormReader(ByteArrayDataInput in) {
private class MyFlyweightEntry extends FlyweightEntry {
private final CharsRef chars;
private final ByteArrayDataInput in;
int dataPos;
private final IntsRef forms = new IntsRef();
private final CharSequence lower;
MyFlyweightEntry(CharsRef chars, ByteArrayDataInput in) {
this.chars = chars;
this.in = in;
forms = new IntsRef();
lower =
new CharSequence() {
@Override
public int length() {
return chars.length;
}
@Override
public char charAt(int index) {
return caseFold(chars.chars[index + chars.offset]);
}
@Override
public CharSequence subSequence(int start, int end) {
throw new UnsupportedOperationException();
}
@Override
public String toString() {
throw new UnsupportedOperationException();
}
};
}
@Override
public IntsRef get() {
boolean hasTitleCase() {
return Character.isUpperCase(chars.charAt(0)) && WordCase.caseOf(chars) == WordCase.TITLE;
}
@Override
CharsRef root() {
return chars;
}
@Override
CharSequence lowerCaseRoot() {
return lower;
}
@Override
IntsRef forms() {
in.setPosition(dataPos);
int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
if (forms.ints.length < entryCount) {

View File

@ -109,11 +109,11 @@ public class TestDictionary extends LuceneTestCase {
Dictionary dictionary, int minLength, int maxLength) {
Set<String> processed = new HashSet<>();
dictionary.words.processSuggestibleWords(
minLength, maxLength, (word, __) -> processed.add(word.toString()));
minLength, maxLength, e -> processed.add(e.root().toString()));
Set<String> cached = new HashSet<>();
SuggestibleEntryCache.buildCache(dictionary.words)
.processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
.processSuggestibleWords(minLength, maxLength, e -> cached.add(e.root().toString()));
assertEquals(processed, cached);
return processed;

View File

@ -99,6 +99,16 @@ public class TestPerformance extends LuceneTestCase {
checkSuggestionPerformance("fr", 1_000);
}
@Test
public void uk() throws Exception {
checkAnalysisPerformance("uk", 200_000);
}
@Test
public void uk_suggest() throws Exception {
checkSuggestionPerformance("uk", 700);
}
private Dictionary loadDictionary(String code) throws IOException, ParseException {
long start = System.nanoTime();
Path aff = findAffFile(code);

View File

@ -3,3 +3,4 @@ uART/XW-
bein/XW-
Stand/UX
UART/-
YouTube

View File

@ -0,0 +1,3 @@
YouTube
UART
UART