mirror of https://github.com/apache/lucene.git
hunspell: speed up the dictionary enumeration (#12447)
* hunspell: speed up the dictionary enumeration

  - cache each word's case and the lowercase form
  - group the words by length to avoid even visiting entries with unneeded lengths
This commit is contained in:
parent
b4619d87ed
commit
f05adff4ca
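
At its core, the speedup comes from bucketing dictionary words by length, so an enumeration bounded to lengths [min, max] never visits entries of other lengths. A rough standalone sketch of that idea (illustrative class, not the actual Lucene code, which packs each length group into parallel arrays):

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

class LengthBucketedWords {
  private final List<List<String>> buckets = new ArrayList<>();

  void add(String word) {
    while (buckets.size() <= word.length()) {
      buckets.add(new ArrayList<>()); // one bucket per word length
    }
    buckets.get(word.length()).add(word);
  }

  // Enumeration bounded to [minLength, maxLength] never touches other buckets.
  void processWords(int minLength, int maxLength, Consumer<String> processor) {
    int limit = Math.min(maxLength, buckets.size() - 1);
    for (int len = Math.max(0, minLength); len <= limit; len++) {
      buckets.get(len).forEach(processor);
    }
  }
}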
@@ -75,6 +75,8 @@ Improvements
 
 * LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)
 
+* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
+
 Optimizations
 ---------------------
 
@@ -1189,7 +1189,12 @@ public class Dictionary {
 
       // finalize last entry
       success = true;
-      return builder.build();
+      return new WordStorage(builder) {
+        @Override
+        char caseFold(char c) {
+          return Dictionary.this.caseFold(c);
+        }
+      };
     } finally {
       if (success) {
         tempDir.deleteFile(sorted);
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+
+/** A mutable entry object used when enumerating the dictionary internally */
+abstract class FlyweightEntry {
+  abstract boolean hasTitleCase();
+
+  abstract CharsRef root();
+
+  abstract CharSequence lowerCaseRoot();
+
+  abstract IntsRef forms();
+}
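
FlyweightEntry is a flyweight: a single mutable instance is repositioned over successive dictionary entries during a scan, so enumeration allocates almost nothing and forms() can stay lazy. A hedged usage sketch against the processSuggestibleWords API from this diff (a hypothetical helper, assumed to sit in the same package as these package-private classes; the counting logic is invented):

// Cheap checks first; decode forms() only for entries that survive them.
static int countSuggestibleForms(WordStorage storage, int minLength, int maxLength) {
  int[] total = {0}; // mutable holder, because the lambda captures it
  storage.processSuggestibleWords(
      minLength,
      maxLength,
      entry -> {
        if (entry.hasTitleCase()) {
          return; // rejected without ever decoding the forms
        }
        total[0] += entry.forms().length;
      });
  return total[0];
}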
@@ -28,9 +28,8 @@ import java.util.Objects;
 import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
-import java.util.function.BiConsumer;
+import java.util.function.Consumer;
 import java.util.function.IntPredicate;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -72,29 +71,22 @@ class GeneratingSuggester {
     IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);
 
     boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
-    TrigramAutomaton automaton =
-        new TrigramAutomaton(word) {
-          @Override
-          char transformChar(char c) {
-            return dictionary.caseFold(c);
-          }
-        };
+    TrigramAutomaton automaton = new TrigramAutomaton(word);
 
     processSuggestibleWords(
         Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
         word.length() + MAX_ROOT_LENGTH_DIFF,
-        (rootChars, formSupplier) -> {
-          if (ignoreTitleCaseRoots
-              && Character.isUpperCase(rootChars.charAt(0))
-              && WordCase.caseOf(rootChars) == WordCase.TITLE) {
+        (entry) -> {
+          if (ignoreTitleCaseRoots && entry.hasTitleCase()) {
             return;
           }
 
-          int sc = automaton.ngramScore(rootChars);
+          int sc = automaton.ngramScore(entry.lowerCaseRoot());
           if (sc == 0) {
             return; // no common characters at all, don't suggest this root
           }
 
+          CharsRef rootChars = entry.root();
           sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
 
           boolean overflow = roots.size() == MAX_ROOTS;
@@ -105,7 +97,7 @@ class GeneratingSuggester {
           speller.checkCanceled.run();
 
           String root = rootChars.toString();
-          IntsRef forms = formSupplier.get();
+          IntsRef forms = entry.forms();
           for (int i = 0; i < forms.length; i++) {
             if (isSuggestible.test(forms.ints[forms.offset + i])) {
               roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
@@ -125,7 +117,7 @@ class GeneratingSuggester {
   }
 
   private void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+      int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
     if (entryCache != null) {
       entryCache.processSuggestibleWords(minLength, maxLength, processor);
     } else {
@@ -16,8 +16,9 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
-import java.util.function.BiConsumer;
-import java.util.function.Supplier;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Consumer;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -28,74 +29,149 @@ import org.apache.lucene.util.IntsRef;
  * compression.
  */
 class SuggestibleEntryCache {
-  private final short[] lengths;
-  private final char[] roots;
-  private final int[] formData;
+  private static final short LOWER_CASE = (short) WordCase.LOWER.ordinal();
+  private static final short NEUTRAL_CASE = (short) WordCase.NEUTRAL.ordinal();
+  private static final short TITLE_CASE = (short) WordCase.TITLE.ordinal();
 
-  private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
-    this.lengths = lengths;
-    this.roots = roots;
-    this.formData = formData;
+  private final Section[] sections;
+
+  private SuggestibleEntryCache(Map<Integer, SectionBuilder> builders) {
+    int maxLength =
+        builders.isEmpty() ? 0 : builders.keySet().stream().max(Integer::compare).orElseThrow();
+    sections = new Section[maxLength + 1];
+    for (int i = 0; i < sections.length; i++) {
+      SectionBuilder builder = builders.get(i);
+      sections[i] = builder == null ? null : builder.build(i);
+    }
   }
 
   static SuggestibleEntryCache buildCache(WordStorage storage) {
     var consumer =
-        new BiConsumer<CharsRef, Supplier<IntsRef>>() {
-          short[] lengths = new short[10];
-          final StringBuilder roots = new StringBuilder();
-          int[] formData = new int[10];
-          int lenOffset = 0;
-          int formDataOffset = 0;
+        new Consumer<FlyweightEntry>() {
+          final Map<Integer, SectionBuilder> builders = new HashMap<>();
 
           @Override
-          public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
+          public void accept(FlyweightEntry entry) {
+            CharsRef root = entry.root();
             if (root.length > Short.MAX_VALUE) {
               throw new UnsupportedOperationException(
                   "Too long dictionary entry, please report this to dev@lucene.apache.org");
             }
 
-            IntsRef forms = formSupplier.get();
-
-            lengths = ArrayUtil.grow(lengths, lenOffset + 2);
-            lengths[lenOffset] = (short) root.length;
-            lengths[lenOffset + 1] = (short) forms.length;
-            lenOffset += 2;
-
-            roots.append(root.chars, root.offset, root.length);
-
-            formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
-            System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
-            formDataOffset += forms.length;
+            builders.computeIfAbsent(root.length, __ -> new SectionBuilder()).add(entry);
           }
         };
 
     storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);
 
-    return new SuggestibleEntryCache(
-        ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
-        consumer.roots.toString().toCharArray(),
-        ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
+    return new SuggestibleEntryCache(consumer.builders);
   }
 
-  void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
-    CharsRef chars = new CharsRef(roots, 0, 0);
-    IntsRef forms = new IntsRef(formData, 0, 0);
-    Supplier<IntsRef> formSupplier = () -> forms;
-    int rootOffset = 0;
-    int formDataOffset = 0;
-    for (int i = 0; i < lengths.length; i += 2) {
-      int rootLength = lengths[i];
-      short formDataLength = lengths[i + 1];
-      if (rootLength >= minLength && rootLength <= maxLength) {
-        chars.offset = rootOffset;
-        chars.length = rootLength;
-        forms.offset = formDataOffset;
-        forms.length = formDataLength;
-        processor.accept(chars, formSupplier);
-      }
-      rootOffset += rootLength;
-      formDataOffset += formDataLength;
-    }
-  }
+  private static class SectionBuilder {
+    final StringBuilder roots = new StringBuilder(), lowRoots = new StringBuilder();
+    short[] meta = new short[10];
+    int[] formData = new int[10];
+    int metaOffset, formDataOffset;
+
+    void add(FlyweightEntry entry) {
+      CharsRef root = entry.root();
+      if (root.length > Short.MAX_VALUE) {
+        throw new UnsupportedOperationException(
+            "Too long dictionary entry, please report this to dev@lucene.apache.org");
+      }
+
+      IntsRef forms = entry.forms();
+
+      short rootCase = (short) WordCase.caseOf(root).ordinal();
+
+      meta = ArrayUtil.grow(meta, metaOffset + 2);
+      meta[metaOffset] = (short) forms.length;
+      meta[metaOffset + 1] = rootCase;
+      metaOffset += 2;
+
+      lowRoots.append(entry.lowerCaseRoot());
+      if (hasUpperCase(rootCase)) {
+        roots.append(root.chars, root.offset, root.length);
+      }
+
+      formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
+      System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
+      formDataOffset += forms.length;
+    }
+
+    Section build(int rootLength) {
+      return new Section(
+          rootLength,
+          ArrayUtil.copyOfSubArray(meta, 0, metaOffset),
+          roots.toString().toCharArray(),
+          lowRoots.toString().toCharArray(),
+          ArrayUtil.copyOfSubArray(formData, 0, formDataOffset));
+    }
+  }
+
+  private static boolean hasUpperCase(short rootCase) {
+    return rootCase != LOWER_CASE && rootCase != NEUTRAL_CASE;
+  }
+
+  void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
+    maxLength = Math.min(maxLength, sections.length - 1);
+    for (int i = Math.min(minLength, sections.length); i <= maxLength; i++) {
+      Section section = sections[i];
+      if (section != null) {
+        section.processWords(processor);
+      }
+    }
+  }
+
+  /**
+   * @param meta The lengths of the entry sub-arrays in formData plus the case information
+   * @param roots original roots if they're not all-lowercase
+   */
+  private record Section(
+      int rootLength, short[] meta, char[] roots, char[] lowRoots, int[] formData) {
+
+    void processWords(Consumer<FlyweightEntry> processor) {
+      CharsRef chars = new CharsRef(roots, 0, Math.min(rootLength, roots.length));
+      CharsRef lowerChars = new CharsRef(lowRoots, 0, rootLength);
+      IntsRef forms = new IntsRef(formData, 0, 0);
+
+      var entry =
+          new FlyweightEntry() {
+            short wordCase;
+
+            @Override
+            CharsRef root() {
+              return hasUpperCase(wordCase) ? chars : lowerChars;
+            }
+
+            @Override
+            boolean hasTitleCase() {
+              return wordCase == TITLE_CASE;
+            }
+
+            @Override
+            CharSequence lowerCaseRoot() {
+              return lowerChars;
+            }
+
+            @Override
+            IntsRef forms() {
+              return forms;
+            }
+          };
+
+      for (int i = 0; i < meta.length; i += 2) {
+        short formDataLength = meta[i];
+        short wordCase = meta[i + 1];
+        forms.length = formDataLength;
+        entry.wordCase = wordCase;
+        processor.accept(entry);
+
+        lowerChars.offset += rootLength;
+        if (hasUpperCase(wordCase)) {
+          chars.offset += rootLength;
+        }
+        forms.offset += formDataLength;
+      }
+    }
+  }
 }
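
To make the Section layout concrete, a hedged worked example (entry contents invented) for a length-4 section built from one title-case root and one lower-case root:

// Length-4 section after SectionBuilder.add("Uart" with 1 form int, "bein" with 2):
//   meta     = [1, TITLE_CASE, 2, LOWER_CASE]  // (formData length, case) per entry
//   lowRoots = "uartbein"                      // folded form of every root
//   roots    = "Uart"                          // originals kept only when not all-lowercase
//   formData = [f0, f1, f2]                    // 1 int for "Uart", then 2 for "bein"
// In processWords, lowerChars.offset advances by rootLength for every entry,
// while chars.offset advances only past entries that had an upper-case original.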
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
 
 import java.util.HashMap;
 import java.util.Map;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -78,7 +77,7 @@ class TrigramAutomaton {
     return state;
   }
 
-  int ngramScore(CharsRef s2) {
+  int ngramScore(CharSequence s2) {
     countedSubstrings.clear();
 
     int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
@@ -86,9 +85,9 @@ class TrigramAutomaton {
     // states of running the automaton on substrings [i-1, i) and [i-2, i)
     int state1 = -1, state2 = -1;
 
-    int limit = s2.length + s2.offset;
-    for (int i = s2.offset; i < limit; i++) {
-      char c = transformChar(s2.chars[i]);
+    int limit = s2.length();
+    for (int i = 0; i < limit; i++) {
+      char c = s2.charAt(i);
       if (c < minChar) {
         state1 = state2 = -1;
         continue;
@@ -121,10 +120,6 @@ class TrigramAutomaton {
     return score;
   }
 
-  char transformChar(char c) {
-    return c;
-  }
-
   private int substringScore(int state, FixedBitSet countedSubstrings) {
     if (countedSubstrings.getAndSet(state)) return 0;
 
@@ -300,9 +300,9 @@ public class WordFormGenerator {
         1,
         Integer.MAX_VALUE,
         false,
-        (root, lazyForms) -> {
-          String rootStr = root.toString();
-          IntsRef forms = lazyForms.get();
+        e -> {
+          String rootStr = e.root().toString();
+          IntsRef forms = e.forms();
           for (int i = 0; i < forms.length; i += dictionary.formStep()) {
             char[] encodedFlags = dictionary.flagLookup.getFlags(forms.ints[forms.offset + i]);
             if (shouldConsiderAtAll(encodedFlags)) {
@@ -19,8 +19,7 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.function.BiConsumer;
-import java.util.function.Supplier;
+import java.util.function.Consumer;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataOutput;
@@ -49,7 +48,7 @@ import org.apache.lucene.util.fst.IntSequenceOutputs;
  * The entries are stored in a contiguous byte array, identified by their offsets, using {@link
  * DataOutput#writeVInt} (VINT) format for compression.
  */
-class WordStorage {
+abstract class WordStorage {
   private static final int OFFSET_BITS = 25;
   private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
   private static final int COLLISION_MASK = 0x40;
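
For readers unfamiliar with the VINT encoding the javadoc mentions, a minimal round-trip using the same Lucene I/O classes this file imports (a hedged illustration; the values are arbitrary):

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

class VIntDemo {
  public static void main(String[] args) throws IOException {
    byte[] buf = new byte[16];
    ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
    out.writeVInt(127); // values below 128 take a single byte
    out.writeVInt(300); // larger values take two to five bytes
    ByteArrayDataInput in = new ByteArrayDataInput(buf);
    System.out.println(in.readVInt()); // 127
    System.out.println(in.readVInt()); // 300
    System.out.println(in.getPosition()); // 3: the two ints used 3 bytes in total
  }
}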
@@ -91,12 +90,15 @@ class WordStorage {
   */
  private final byte[] wordData;
 
-  private WordStorage(
-      int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
-    this.maxEntryLength = maxEntryLength;
-    this.hasCustomMorphData = hasCustomMorphData;
-    this.hashTable = hashTable;
-    this.wordData = wordData;
+  WordStorage(Builder builder) throws IOException {
+    if (builder.hashTable.length > 0) {
+      assert !builder.group.isEmpty() : "WordStorage builder should be only used once";
+      builder.flushGroup();
+    }
+    this.maxEntryLength = builder.maxEntryLength;
+    this.hasCustomMorphData = builder.hasCustomMorphData;
+    this.hashTable = builder.hashTable.length == 0 ? new int[1] : builder.hashTable;
+    this.wordData = ArrayUtil.copyOfSubArray(builder.wordData, 0, builder.dataWriter.getPosition());
   }
 
   IntsRef lookupWord(char[] word, int offset, int length) {
@@ -157,22 +159,20 @@ class WordStorage {
   * or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they
   * can be modified in any way, but may not be saved for later by the processor
   */
-  void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+  void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
     processAllWords(minLength, maxLength, true, processor);
   }
 
   void processAllWords(
-      int minLength,
-      int maxLength,
-      boolean suggestibleOnly,
-      BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+      int minLength, int maxLength, boolean suggestibleOnly, Consumer<FlyweightEntry> processor) {
     assert minLength <= maxLength;
     maxLength = Math.min(maxEntryLength, maxLength);
 
     CharsRef chars = new CharsRef(maxLength);
     ByteArrayDataInput in = new ByteArrayDataInput(wordData);
-    var formSupplier = new LazyFormReader(in);
+
+    var entry = new MyFlyweightEntry(chars, in);
 
     for (int entryCode : hashTable) {
       int pos = entryCode & OFFSET_MASK;
       int mask = entryCode >>> OFFSET_BITS;
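
The javadoc above carries the key contract: the flyweight passed to the callback is reused, so anything kept past the callback must be copied first. A hedged sketch (hypothetical helper, assumed package-local to WordStorage; the copy via root().toString() mirrors what the tests in this commit do):

import java.util.ArrayList;
import java.util.List;

static List<String> collectSuggestibleRoots(WordStorage storage) {
  List<String> out = new ArrayList<>();
  // Wrong: out.add(e.root()) would store one repositioned CharsRef many times over.
  // Right: copy the characters before keeping them.
  storage.processSuggestibleWords(1, Integer.MAX_VALUE, e -> out.add(e.root().toString()));
  return out;
}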
@@ -195,7 +195,7 @@ class WordStorage {
     }
 
     if (mightMatch) {
-      formSupplier.dataPos = in.getPosition();
+      entry.dataPos = in.getPosition();
       while (prevPos != 0 && wordStart > 0) {
         in.setPosition(prevPos);
         chars.chars[--wordStart] = (char) in.readVInt();
@@ -205,7 +205,7 @@ class WordStorage {
       if (prevPos == 0) {
         chars.offset = wordStart;
         chars.length = maxLength - wordStart;
-        processor.accept(chars, formSupplier);
+        processor.accept(entry);
       }
     }
 
@@ -422,30 +422,61 @@ class WordStorage {
       }
       return false;
     }
 
-    WordStorage build() throws IOException {
-      if (hashTable.length > 0) {
-        assert !group.isEmpty() : "build() should be only called once";
-        flushGroup();
-      }
-      byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
-      int[] table = hashTable.length == 0 ? new int[1] : hashTable;
-      return new WordStorage(maxEntryLength, hasCustomMorphData, table, trimmedData);
-    }
   }
 
-  private class LazyFormReader implements Supplier<IntsRef> {
-    int dataPos;
-    private final ByteArrayDataInput in;
-    private final IntsRef forms;
+  abstract char caseFold(char c);
 
-    LazyFormReader(ByteArrayDataInput in) {
+  private class MyFlyweightEntry extends FlyweightEntry {
+    private final CharsRef chars;
+    private final ByteArrayDataInput in;
+    int dataPos;
+    private final IntsRef forms = new IntsRef();
+    private final CharSequence lower;
+
+    MyFlyweightEntry(CharsRef chars, ByteArrayDataInput in) {
+      this.chars = chars;
       this.in = in;
-      forms = new IntsRef();
+      lower =
+          new CharSequence() {
+            @Override
+            public int length() {
+              return chars.length;
+            }
+
+            @Override
+            public char charAt(int index) {
+              return caseFold(chars.chars[index + chars.offset]);
+            }
+
+            @Override
+            public CharSequence subSequence(int start, int end) {
+              throw new UnsupportedOperationException();
+            }
+
+            @Override
+            public String toString() {
+              throw new UnsupportedOperationException();
+            }
+          };
     }
 
     @Override
-    public IntsRef get() {
+    boolean hasTitleCase() {
+      return Character.isUpperCase(chars.charAt(0)) && WordCase.caseOf(chars) == WordCase.TITLE;
+    }
+
+    @Override
+    CharsRef root() {
+      return chars;
+    }
+
+    @Override
+    CharSequence lowerCaseRoot() {
+      return lower;
+    }
+
+    @Override
+    IntsRef forms() {
       in.setPosition(dataPos);
       int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
       if (forms.ints.length < entryCount) {
@@ -109,11 +109,11 @@ public class TestDictionary extends LuceneTestCase {
       Dictionary dictionary, int minLength, int maxLength) {
     Set<String> processed = new HashSet<>();
     dictionary.words.processSuggestibleWords(
-        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+        minLength, maxLength, e -> processed.add(e.root().toString()));
 
     Set<String> cached = new HashSet<>();
     SuggestibleEntryCache.buildCache(dictionary.words)
-        .processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
+        .processSuggestibleWords(minLength, maxLength, e -> cached.add(e.root().toString()));
     assertEquals(processed, cached);
 
     return processed;
@@ -99,6 +99,16 @@ public class TestPerformance extends LuceneTestCase {
     checkSuggestionPerformance("fr", 1_000);
   }
 
+  @Test
+  public void uk() throws Exception {
+    checkAnalysisPerformance("uk", 200_000);
+  }
+
+  @Test
+  public void uk_suggest() throws Exception {
+    checkSuggestionPerformance("uk", 700);
+  }
+
   private Dictionary loadDictionary(String code) throws IOException, ParseException {
     long start = System.nanoTime();
     Path aff = findAffFile(code);
@@ -2,4 +2,5 @@
 uART/XW-
 bein/XW-
+Stand/UX
 UART/-
-UART/-
+YouTube
@@ -0,0 +1,3 @@
+YouTube
+UART
+UART
@@ -1,3 +1,4 @@
 You
+StandUart
 uART
 Uart