From f05adff4ca46358f56331aa21fa69796b61bd4ef Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 18 Jul 2023 21:25:26 +0200 Subject: [PATCH] hunspell: speed up the dictionary enumeration (#12447) * hunspell: speed up the dictionary enumeration cache each word's case and the lowercase form group the words by lengths to avoid even visiting entries with unneeded lengths --- lucene/CHANGES.txt | 2 + .../lucene/analysis/hunspell/Dictionary.java | 7 +- .../analysis/hunspell/FlyweightEntry.java | 31 +++ .../hunspell/GeneratingSuggester.java | 24 +-- .../hunspell/SuggestibleEntryCache.java | 178 +++++++++++++----- .../analysis/hunspell/TrigramAutomaton.java | 13 +- .../analysis/hunspell/WordFormGenerator.java | 6 +- .../lucene/analysis/hunspell/WordStorage.java | 101 ++++++---- .../analysis/hunspell/TestDictionary.java | 4 +- .../analysis/hunspell/TestPerformance.java | 10 + .../analysis/hunspell/germanManualCase.dic | 3 +- .../analysis/hunspell/germanManualCase.sug | 3 + .../analysis/hunspell/germanManualCase.wrong | 1 + 13 files changed, 265 insertions(+), 118 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 919f16ad255..2eed5e676cb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -75,6 +75,8 @@ Improvements * LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan) +* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov) + Optimizations --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 820acddbc09..50c11159a0d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1189,7 +1189,12 @@ public class Dictionary { // finalize last entry success = true; - return builder.build(); + return new WordStorage(builder) { + @Override + char caseFold(char c) { + return Dictionary.this.caseFold(c); + } + }; } finally { if (success) { tempDir.deleteFile(sorted); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java new file mode 100644 index 00000000000..337a9b5c0c0 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.hunspell; + +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; + +/** A mutable entry object used when enumerating the dictionary internally */ +abstract class FlyweightEntry { + abstract boolean hasTitleCase(); + + abstract CharsRef root(); + + abstract CharSequence lowerCaseRoot(); + + abstract IntsRef forms(); +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 82d58080f9b..892abfdd606 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -28,9 +28,8 @@ import java.util.Objects; import java.util.PriorityQueue; import java.util.Set; import java.util.TreeSet; -import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.function.IntPredicate; -import java.util.function.Supplier; import java.util.stream.Collectors; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; @@ -72,29 +71,22 @@ class GeneratingSuggester { IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags); boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de"); - TrigramAutomaton automaton = - new TrigramAutomaton(word) { - @Override - char transformChar(char c) { - return dictionary.caseFold(c); - } - }; + TrigramAutomaton automaton = new TrigramAutomaton(word); processSuggestibleWords( Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF), word.length() + MAX_ROOT_LENGTH_DIFF, - (rootChars, formSupplier) -> { - if (ignoreTitleCaseRoots - && Character.isUpperCase(rootChars.charAt(0)) - && WordCase.caseOf(rootChars) == WordCase.TITLE) { + (entry) -> { + if (ignoreTitleCaseRoots && entry.hasTitleCase()) { return; } - int sc = automaton.ngramScore(rootChars); + int sc = automaton.ngramScore(entry.lowerCaseRoot()); if (sc == 0) { return; // no common characters at all, don't suggest this root } + CharsRef rootChars = entry.root(); sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length); boolean overflow = roots.size() == MAX_ROOTS; @@ -105,7 +97,7 @@ class GeneratingSuggester { speller.checkCanceled.run(); String root = rootChars.toString(); - IntsRef forms = formSupplier.get(); + IntsRef forms = entry.forms(); for (int i = 0; i < forms.length; i++) { if (isSuggestible.test(forms.ints[forms.offset + i])) { roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc)); @@ -125,7 +117,7 @@ class GeneratingSuggester { } private void processSuggestibleWords( - int minLength, int maxLength, BiConsumer> processor) { + int minLength, int maxLength, Consumer processor) { if (entryCache != null) { entryCache.processSuggestibleWords(minLength, maxLength, processor); } else { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java index 0dd8ce91d1d..3896da433df 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java @@ -16,8 +16,9 @@ */ package org.apache.lucene.analysis.hunspell; -import 
java.util.function.BiConsumer; -import java.util.function.Supplier; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Consumer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; @@ -28,74 +29,149 @@ import org.apache.lucene.util.IntsRef; * compression. */ class SuggestibleEntryCache { - private final short[] lengths; - private final char[] roots; - private final int[] formData; + private static final short LOWER_CASE = (short) WordCase.LOWER.ordinal(); + private static final short NEUTRAL_CASE = (short) WordCase.NEUTRAL.ordinal(); + private static final short TITLE_CASE = (short) WordCase.TITLE.ordinal(); - private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) { - this.lengths = lengths; - this.roots = roots; - this.formData = formData; + private final Section[] sections; + + private SuggestibleEntryCache(Map builders) { + int maxLength = + builders.isEmpty() ? 0 : builders.keySet().stream().max(Integer::compare).orElseThrow(); + sections = new Section[maxLength + 1]; + for (int i = 0; i < sections.length; i++) { + SectionBuilder builder = builders.get(i); + sections[i] = builder == null ? null : builder.build(i); + } } static SuggestibleEntryCache buildCache(WordStorage storage) { var consumer = - new BiConsumer>() { - short[] lengths = new short[10]; - final StringBuilder roots = new StringBuilder(); - int[] formData = new int[10]; - int lenOffset = 0; - int formDataOffset = 0; + new Consumer() { + final Map builders = new HashMap<>(); @Override - public void accept(CharsRef root, Supplier formSupplier) { + public void accept(FlyweightEntry entry) { + CharsRef root = entry.root(); if (root.length > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too long dictionary entry, please report this to dev@lucene.apache.org"); } - IntsRef forms = formSupplier.get(); - - lengths = ArrayUtil.grow(lengths, lenOffset + 2); - lengths[lenOffset] = (short) root.length; - lengths[lenOffset + 1] = (short) forms.length; - lenOffset += 2; - - roots.append(root.chars, root.offset, root.length); - - formData = ArrayUtil.grow(formData, formDataOffset + forms.length); - System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length); - formDataOffset += forms.length; + builders.computeIfAbsent(root.length, __ -> new SectionBuilder()).add(entry); } }; - storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer); - return new SuggestibleEntryCache( - ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset), - consumer.roots.toString().toCharArray(), - ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset)); + return new SuggestibleEntryCache(consumer.builders); } - void processSuggestibleWords( - int minLength, int maxLength, BiConsumer> processor) { - CharsRef chars = new CharsRef(roots, 0, 0); - IntsRef forms = new IntsRef(formData, 0, 0); - Supplier formSupplier = () -> forms; - int rootOffset = 0; - int formDataOffset = 0; - for (int i = 0; i < lengths.length; i += 2) { - int rootLength = lengths[i]; - short formDataLength = lengths[i + 1]; - if (rootLength >= minLength && rootLength <= maxLength) { - chars.offset = rootOffset; - chars.length = rootLength; - forms.offset = formDataOffset; - forms.length = formDataLength; - processor.accept(chars, formSupplier); + private static class SectionBuilder { + final StringBuilder roots = new StringBuilder(), lowRoots = new StringBuilder(); + short[] meta = new short[10]; + int[] formData 
= new int[10]; + int metaOffset, formDataOffset; + + void add(FlyweightEntry entry) { + CharsRef root = entry.root(); + if (root.length > Short.MAX_VALUE) { + throw new UnsupportedOperationException( + "Too long dictionary entry, please report this to dev@lucene.apache.org"); + } + + IntsRef forms = entry.forms(); + + short rootCase = (short) WordCase.caseOf(root).ordinal(); + + meta = ArrayUtil.grow(meta, metaOffset + 2); + meta[metaOffset] = (short) forms.length; + meta[metaOffset + 1] = rootCase; + metaOffset += 2; + + lowRoots.append(entry.lowerCaseRoot()); + if (hasUpperCase(rootCase)) { + roots.append(root.chars, root.offset, root.length); + } + + formData = ArrayUtil.grow(formData, formDataOffset + forms.length); + System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length); + formDataOffset += forms.length; + } + + Section build(int rootLength) { + return new Section( + rootLength, + ArrayUtil.copyOfSubArray(meta, 0, metaOffset), + roots.toString().toCharArray(), + lowRoots.toString().toCharArray(), + ArrayUtil.copyOfSubArray(formData, 0, formDataOffset)); + } + } + + private static boolean hasUpperCase(short rootCase) { + return rootCase != LOWER_CASE && rootCase != NEUTRAL_CASE; + } + + void processSuggestibleWords(int minLength, int maxLength, Consumer processor) { + maxLength = Math.min(maxLength, sections.length - 1); + for (int i = Math.min(minLength, sections.length); i <= maxLength; i++) { + Section section = sections[i]; + if (section != null) { + section.processWords(processor); + } + } + } + + /** + * @param meta The lengths of the entry sub-arrays in formData plus the case information + * @param roots original roots if they're not all-lowercase + */ + private record Section( + int rootLength, short[] meta, char[] roots, char[] lowRoots, int[] formData) { + + void processWords(Consumer processor) { + CharsRef chars = new CharsRef(roots, 0, Math.min(rootLength, roots.length)); + CharsRef lowerChars = new CharsRef(lowRoots, 0, rootLength); + IntsRef forms = new IntsRef(formData, 0, 0); + + var entry = + new FlyweightEntry() { + short wordCase; + + @Override + CharsRef root() { + return hasUpperCase(wordCase) ? 
chars : lowerChars; + } + + @Override + boolean hasTitleCase() { + return wordCase == TITLE_CASE; + } + + @Override + CharSequence lowerCaseRoot() { + return lowerChars; + } + + @Override + IntsRef forms() { + return forms; + } + }; + + for (int i = 0; i < meta.length; i += 2) { + short formDataLength = meta[i]; + short wordCase = meta[i + 1]; + forms.length = formDataLength; + entry.wordCase = wordCase; + processor.accept(entry); + + lowerChars.offset += rootLength; + if (hasUpperCase(wordCase)) { + chars.offset += rootLength; + } + forms.offset += formDataLength; } - rootOffset += rootLength; - formDataOffset += formDataLength; } } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java index f4404e4bcf0..2e6091541d2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell; import java.util.HashMap; import java.util.Map; -import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -78,7 +77,7 @@ class TrigramAutomaton { return state; } - int ngramScore(CharsRef s2) { + int ngramScore(CharSequence s2) { countedSubstrings.clear(); int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3 @@ -86,9 +85,9 @@ class TrigramAutomaton { // states of running the automaton on substrings [i-1, i) and [i-2, i) int state1 = -1, state2 = -1; - int limit = s2.length + s2.offset; - for (int i = s2.offset; i < limit; i++) { - char c = transformChar(s2.chars[i]); + int limit = s2.length(); + for (int i = 0; i < limit; i++) { + char c = s2.charAt(i); if (c < minChar) { state1 = state2 = -1; continue; @@ -121,10 +120,6 @@ class TrigramAutomaton { return score; } - char transformChar(char c) { - return c; - } - private int substringScore(int state, FixedBitSet countedSubstrings) { if (countedSubstrings.getAndSet(state)) return 0; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java index 38ef1a9d310..2e0799f4caa 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java @@ -300,9 +300,9 @@ public class WordFormGenerator { 1, Integer.MAX_VALUE, false, - (root, lazyForms) -> { - String rootStr = root.toString(); - IntsRef forms = lazyForms.get(); + e -> { + String rootStr = e.root().toString(); + IntsRef forms = e.forms(); for (int i = 0; i < forms.length; i += dictionary.formStep()) { char[] encodedFlags = dictionary.flagLookup.getFlags(forms.ints[forms.offset + i]); if (shouldConsiderAtAll(encodedFlags)) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 83fdfdea47d..4dcbca3135e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -19,8 +19,7 @@ 
package org.apache.lucene.analysis.hunspell; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.function.BiConsumer; -import java.util.function.Supplier; +import java.util.function.Consumer; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.DataOutput; @@ -49,7 +48,7 @@ import org.apache.lucene.util.fst.IntSequenceOutputs; * The entries are stored in a contiguous byte array, identified by their offsets, using {@link * DataOutput#writeVInt} ()} VINT} format for compression. */ -class WordStorage { +abstract class WordStorage { private static final int OFFSET_BITS = 25; private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1; private static final int COLLISION_MASK = 0x40; @@ -91,12 +90,15 @@ class WordStorage { */ private final byte[] wordData; - private WordStorage( - int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) { - this.maxEntryLength = maxEntryLength; - this.hasCustomMorphData = hasCustomMorphData; - this.hashTable = hashTable; - this.wordData = wordData; + WordStorage(Builder builder) throws IOException { + if (builder.hashTable.length > 0) { + assert !builder.group.isEmpty() : "WordStorage builder should be only used once"; + builder.flushGroup(); + } + this.maxEntryLength = builder.maxEntryLength; + this.hasCustomMorphData = builder.hasCustomMorphData; + this.hashTable = builder.hashTable.length == 0 ? new int[1] : builder.hashTable; + this.wordData = ArrayUtil.copyOfSubArray(builder.wordData, 0, builder.dataWriter.getPosition()); } IntsRef lookupWord(char[] word, int offset, int length) { @@ -157,22 +159,20 @@ class WordStorage { * or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they * can be modified in any way, but may not be saved for later by the processor */ - void processSuggestibleWords( - int minLength, int maxLength, BiConsumer> processor) { + void processSuggestibleWords(int minLength, int maxLength, Consumer processor) { processAllWords(minLength, maxLength, true, processor); } void processAllWords( - int minLength, - int maxLength, - boolean suggestibleOnly, - BiConsumer> processor) { + int minLength, int maxLength, boolean suggestibleOnly, Consumer processor) { assert minLength <= maxLength; maxLength = Math.min(maxEntryLength, maxLength); CharsRef chars = new CharsRef(maxLength); ByteArrayDataInput in = new ByteArrayDataInput(wordData); - var formSupplier = new LazyFormReader(in); + + var entry = new MyFlyweightEntry(chars, in); + for (int entryCode : hashTable) { int pos = entryCode & OFFSET_MASK; int mask = entryCode >>> OFFSET_BITS; @@ -195,7 +195,7 @@ class WordStorage { } if (mightMatch) { - formSupplier.dataPos = in.getPosition(); + entry.dataPos = in.getPosition(); while (prevPos != 0 && wordStart > 0) { in.setPosition(prevPos); chars.chars[--wordStart] = (char) in.readVInt(); @@ -205,7 +205,7 @@ class WordStorage { if (prevPos == 0) { chars.offset = wordStart; chars.length = maxLength - wordStart; - processor.accept(chars, formSupplier); + processor.accept(entry); } } @@ -422,30 +422,61 @@ class WordStorage { } return false; } - - WordStorage build() throws IOException { - if (hashTable.length > 0) { - assert !group.isEmpty() : "build() should be only called once"; - flushGroup(); - } - byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()); - int[] table = hashTable.length == 0 ? 
new int[1] : hashTable; - return new WordStorage(maxEntryLength, hasCustomMorphData, table, trimmedData); - } } - private class LazyFormReader implements Supplier { - int dataPos; - private final ByteArrayDataInput in; - private final IntsRef forms; + abstract char caseFold(char c); - LazyFormReader(ByteArrayDataInput in) { + private class MyFlyweightEntry extends FlyweightEntry { + private final CharsRef chars; + private final ByteArrayDataInput in; + int dataPos; + private final IntsRef forms = new IntsRef(); + private final CharSequence lower; + + MyFlyweightEntry(CharsRef chars, ByteArrayDataInput in) { + this.chars = chars; this.in = in; - forms = new IntsRef(); + lower = + new CharSequence() { + @Override + public int length() { + return chars.length; + } + + @Override + public char charAt(int index) { + return caseFold(chars.chars[index + chars.offset]); + } + + @Override + public CharSequence subSequence(int start, int end) { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() { + throw new UnsupportedOperationException(); + } + }; } @Override - public IntsRef get() { + boolean hasTitleCase() { + return Character.isUpperCase(chars.charAt(0)) && WordCase.caseOf(chars) == WordCase.TITLE; + } + + @Override + CharsRef root() { + return chars; + } + + @Override + CharSequence lowerCaseRoot() { + return lower; + } + + @Override + IntsRef forms() { in.setPosition(dataPos); int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1); if (forms.ints.length < entryCount) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 81d8f55d788..a7335a0c3f5 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -109,11 +109,11 @@ public class TestDictionary extends LuceneTestCase { Dictionary dictionary, int minLength, int maxLength) { Set processed = new HashSet<>(); dictionary.words.processSuggestibleWords( - minLength, maxLength, (word, __) -> processed.add(word.toString())); + minLength, maxLength, e -> processed.add(e.root().toString())); Set cached = new HashSet<>(); SuggestibleEntryCache.buildCache(dictionary.words) - .processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString())); + .processSuggestibleWords(minLength, maxLength, e -> cached.add(e.root().toString())); assertEquals(processed, cached); return processed; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java index 8344f8dd968..232f12e13c1 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java @@ -99,6 +99,16 @@ public class TestPerformance extends LuceneTestCase { checkSuggestionPerformance("fr", 1_000); } + @Test + public void uk() throws Exception { + checkAnalysisPerformance("uk", 200_000); + } + + @Test + public void uk_suggest() throws Exception { + checkSuggestionPerformance("uk", 700); + } + private Dictionary loadDictionary(String code) throws IOException, ParseException { long start = System.nanoTime(); Path aff = findAffFile(code); diff --git 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic index 5e075003c9a..efc32df832d 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic @@ -2,4 +2,5 @@ uART/XW- bein/XW- Stand/UX -UART/- \ No newline at end of file +UART/- +YouTube \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug new file mode 100644 index 00000000000..315354ab1ee --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug @@ -0,0 +1,3 @@ +YouTube +UART +UART diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong index c3ce031400c..b84fe360ff4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong @@ -1,3 +1,4 @@ +You StandUart uART Uart
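
Note: the two optimizations named in the commit message can be hard to see through the parallel-array bookkeeping in SuggestibleEntryCache. Below is a minimal, simplified sketch of the length-grouping idea only; the class and method names are hypothetical, and it uses a TreeMap of String lists where the real cache uses a Section[] indexed by root length with packed primitive arrays to avoid per-entry objects.

import java.util.ArrayList;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.function.Consumer;

/**
 * Minimal sketch (hypothetical class, not part of the patch): bucket dictionary
 * roots by their length so that a length-bounded scan never visits roots of any
 * other length.
 */
final class LengthBucketedRoots {
  // bucket key = root length; each bucket holds only the roots of exactly that length
  private final NavigableMap<Integer, List<String>> buckets = new TreeMap<>();

  void add(String root) {
    buckets.computeIfAbsent(root.length(), len -> new ArrayList<>()).add(root);
  }

  void processWords(int minLength, int maxLength, Consumer<String> processor) {
    // subMap yields only the buckets whose length lies in [minLength, maxLength];
    // entries of other lengths are skipped without being enumerated at all
    for (List<String> bucket : buckets.subMap(minLength, true, maxLength, true).values()) {
      bucket.forEach(processor);
    }
  }
}

In the patch, GeneratingSuggester bounds the requested root length by word.length() +/- MAX_ROOT_LENGTH_DIFF, so with this grouping the suggester only ever touches the sections whose lengths fall in that narrow window.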
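The other half of the change avoids re-deriving case information for every enumerated root: FlyweightEntry exposes a precomputed title-case flag and a lowercase view of the root, so TrigramAutomaton.ngramScore no longer case-folds characters itself. A minimal sketch of the lazily case-folded view used by WordStorage's flyweight entry follows, with Character.toLowerCase standing in for Dictionary#caseFold (which additionally honors IGNORE characters and language-specific folding).

import org.apache.lucene.util.CharsRef;

/**
 * Minimal sketch (hypothetical class): a lowercase CharSequence view over a reused
 * CharsRef, folding characters lazily instead of materializing a lowercase String
 * for every dictionary root.
 */
final class LowerCaseView implements CharSequence {
  private final CharsRef ref;

  LowerCaseView(CharsRef ref) {
    this.ref = ref;
  }

  @Override
  public int length() {
    return ref.length;
  }

  @Override
  public char charAt(int index) {
    // fold one character at a time; the backing CharsRef may be repositioned
    // between callbacks, so nothing is cached here
    return Character.toLowerCase(ref.chars[ref.offset + index]);
  }

  @Override
  public CharSequence subSequence(int start, int end) {
    throw new UnsupportedOperationException();
  }
}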