hunspell: speed up the dictionary enumeration (#12447)

* hunspell: speed up the dictionary enumeration: cache each word's case and the lowercase form; group the words by length to avoid even visiting entries with unneeded lengths
2023-07-18 21:25:26 +02:00 · 2023-07-18 21:25:26 +02:00 · f05adff4ca
parent b4619d87ed
commit f05adff4ca
13 changed files with 265 additions and 118 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,6 +75,8 @@ Improvements

 * LUCENE-10652: Add a top-n range faceting example to RangeFacetsExample. (Yuting Gan)

+* GITHUB#12447: Hunspell: speed up the dictionary enumeration (Peter Gromov)
+
 Optimizations
 ---------------------

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -1189,7 +1189,12 @@ public class Dictionary {

      // finalize last entry
      success = true;
-      return builder.build();
+      return new WordStorage(builder) {
+        @Override
+        char caseFold(char c) {
+          return Dictionary.this.caseFold(c);
+        }
+      };
    } finally {
      if (success) {
        tempDir.deleteFile(sorted);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlyweightEntry.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+
+/** A mutable entry object reused between callbacks when enumerating the dictionary internally */
+abstract class FlyweightEntry {
+  abstract boolean hasTitleCase();
+
+  abstract CharsRef root();
+
+  abstract CharSequence lowerCaseRoot();
+
+  abstract IntsRef forms();
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -28,9 +28,8 @@ import java.util.Objects;
 import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
-import java.util.function.BiConsumer;
+import java.util.function.Consumer;
 import java.util.function.IntPredicate;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -72,29 +71,22 @@ class GeneratingSuggester {
    IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);

    boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
-    TrigramAutomaton automaton =
-        new TrigramAutomaton(word) {
-          @Override
-          char transformChar(char c) {
-            return dictionary.caseFold(c);
-          }
-        };
+    TrigramAutomaton automaton = new TrigramAutomaton(word);

    processSuggestibleWords(
        Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
        word.length() + MAX_ROOT_LENGTH_DIFF,
-        (rootChars, formSupplier) -> {
-          if (ignoreTitleCaseRoots
-              && Character.isUpperCase(rootChars.charAt(0))
-              && WordCase.caseOf(rootChars) == WordCase.TITLE) {
+        (entry) -> {
+          if (ignoreTitleCaseRoots && entry.hasTitleCase()) {
            return;
          }

-          int sc = automaton.ngramScore(rootChars);
+          int sc = automaton.ngramScore(entry.lowerCaseRoot());
          if (sc == 0) {
            return; // no common characters at all, don't suggest this root
          }

+          CharsRef rootChars = entry.root();
          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

          boolean overflow = roots.size() == MAX_ROOTS;
@@ -105,7 +97,7 @@ class GeneratingSuggester {
          speller.checkCanceled.run();

          String root = rootChars.toString();
-          IntsRef forms = formSupplier.get();
+          IntsRef forms = entry.forms();
          for (int i = 0; i < forms.length; i++) {
            if (isSuggestible.test(forms.ints[forms.offset + i])) {
              roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
@@ -125,7 +117,7 @@ class GeneratingSuggester {
  }

  private void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+      int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
    if (entryCache != null) {
      entryCache.processSuggestibleWords(minLength, maxLength, processor);
    } else {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java
@@ -16,8 +16,9 @@
 */
 package org.apache.lucene.analysis.hunspell;

-import java.util.function.BiConsumer;
-import java.util.function.Supplier;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Consumer;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -28,74 +29,149 @@ import org.apache.lucene.util.IntsRef;
 * compression.
 */
 class SuggestibleEntryCache {
-  private final short[] lengths;
-  private final char[] roots;
-  private final int[] formData;
+  private static final short LOWER_CASE = (short) WordCase.LOWER.ordinal();
+  private static final short NEUTRAL_CASE = (short) WordCase.NEUTRAL.ordinal();
+  private static final short TITLE_CASE = (short) WordCase.TITLE.ordinal();

-  private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
-    this.lengths = lengths;
-    this.roots = roots;
-    this.formData = formData;
+  private final Section[] sections;
+
+  private SuggestibleEntryCache(Map<Integer, SectionBuilder> builders) {
+    int maxLength =
+        builders.isEmpty() ? 0 : builders.keySet().stream().max(Integer::compare).orElseThrow();
+    sections = new Section[maxLength + 1];
+    for (int i = 0; i < sections.length; i++) {
+      SectionBuilder builder = builders.get(i);
+      sections[i] = builder == null ? null : builder.build(i);
+    }
  }

  static SuggestibleEntryCache buildCache(WordStorage storage) {
    var consumer =
-        new BiConsumer<CharsRef, Supplier<IntsRef>>() {
-          short[] lengths = new short[10];
-          final StringBuilder roots = new StringBuilder();
-          int[] formData = new int[10];
-          int lenOffset = 0;
-          int formDataOffset = 0;
+        new Consumer<FlyweightEntry>() {
+          final Map<Integer, SectionBuilder> builders = new HashMap<>();

          @Override
-          public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
+          public void accept(FlyweightEntry entry) {
+            CharsRef root = entry.root();
            if (root.length > Short.MAX_VALUE) {
              throw new UnsupportedOperationException(
                  "Too long dictionary entry, please report this to dev@lucene.apache.org");
            }

-            IntsRef forms = formSupplier.get();
-
-            lengths = ArrayUtil.grow(lengths, lenOffset + 2);
-            lengths[lenOffset] = (short) root.length;
-            lengths[lenOffset + 1] = (short) forms.length;
-            lenOffset += 2;
-
-            roots.append(root.chars, root.offset, root.length);
-
-            formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
-            System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
-            formDataOffset += forms.length;
+            builders.computeIfAbsent(root.length, __ -> new SectionBuilder()).add(entry);
          }
        };
-
    storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);

-    return new SuggestibleEntryCache(
-        ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
-        consumer.roots.toString().toCharArray(),
-        ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
+    return new SuggestibleEntryCache(consumer.builders);
  }

-  void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
-    CharsRef chars = new CharsRef(roots, 0, 0);
-    IntsRef forms = new IntsRef(formData, 0, 0);
-    Supplier<IntsRef> formSupplier = () -> forms;
-    int rootOffset = 0;
-    int formDataOffset = 0;
-    for (int i = 0; i < lengths.length; i += 2) {
-      int rootLength = lengths[i];
-      short formDataLength = lengths[i + 1];
-      if (rootLength >= minLength && rootLength <= maxLength) {
-        chars.offset = rootOffset;
-        chars.length = rootLength;
-        forms.offset = formDataOffset;
-        forms.length = formDataLength;
-        processor.accept(chars, formSupplier);
+  private static class SectionBuilder {
+    final StringBuilder roots = new StringBuilder(), lowRoots = new StringBuilder();
+    short[] meta = new short[10];
+    int[] formData = new int[10];
+    int metaOffset, formDataOffset;
+
+    void add(FlyweightEntry entry) {
+      CharsRef root = entry.root();
+      if (root.length > Short.MAX_VALUE) {
+        throw new UnsupportedOperationException(
+            "Too long dictionary entry, please report this to dev@lucene.apache.org");
+      }
+
+      IntsRef forms = entry.forms();
+
+      short rootCase = (short) WordCase.caseOf(root).ordinal();
+
+      meta = ArrayUtil.grow(meta, metaOffset + 2);
+      meta[metaOffset] = (short) forms.length;
+      meta[metaOffset + 1] = rootCase;
+      metaOffset += 2;
+
+      lowRoots.append(entry.lowerCaseRoot());
+      if (hasUpperCase(rootCase)) {
+        roots.append(root.chars, root.offset, root.length);
+      }
+
+      formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
+      System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
+      formDataOffset += forms.length;
+    }
+
+    Section build(int rootLength) {
+      return new Section(
+          rootLength,
+          ArrayUtil.copyOfSubArray(meta, 0, metaOffset),
+          roots.toString().toCharArray(),
+          lowRoots.toString().toCharArray(),
+          ArrayUtil.copyOfSubArray(formData, 0, formDataOffset));
+    }
+  }
+
+  private static boolean hasUpperCase(short rootCase) {
+    return rootCase != LOWER_CASE && rootCase != NEUTRAL_CASE;
+  }
+
+  void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
+    maxLength = Math.min(maxLength, sections.length - 1);
+    for (int i = Math.min(minLength, sections.length); i <= maxLength; i++) {
+      Section section = sections[i];
+      if (section != null) {
+        section.processWords(processor);
+      }
+    }
+  }
+
+  /**
+   * @param meta for each entry, the length of its sub-array in formData and its WordCase ordinal
+   * @param roots the original roots, stored only for entries whose case is neither lower nor neutral
+   */
+  private record Section(
+      int rootLength, short[] meta, char[] roots, char[] lowRoots, int[] formData) {
+
+    void processWords(Consumer<FlyweightEntry> processor) {
+      CharsRef chars = new CharsRef(roots, 0, Math.min(rootLength, roots.length));
+      CharsRef lowerChars = new CharsRef(lowRoots, 0, rootLength);
+      IntsRef forms = new IntsRef(formData, 0, 0);
+
+      var entry =
+          new FlyweightEntry() {
+            short wordCase;
+
+            @Override
+            CharsRef root() {
+              return hasUpperCase(wordCase) ? chars : lowerChars;
+            }
+
+            @Override
+            boolean hasTitleCase() {
+              return wordCase == TITLE_CASE;
+            }
+
+            @Override
+            CharSequence lowerCaseRoot() {
+              return lowerChars;
+            }
+
+            @Override
+            IntsRef forms() {
+              return forms;
+            }
+          };
+
+      for (int i = 0; i < meta.length; i += 2) {
+        short formDataLength = meta[i];
+        short wordCase = meta[i + 1];
+        forms.length = formDataLength;
+        entry.wordCase = wordCase;
+        processor.accept(entry);
+
+        lowerChars.offset += rootLength;
+        if (hasUpperCase(wordCase)) {
+          chars.offset += rootLength;
+        }
+        forms.offset += formDataLength;
      }
-      rootOffset += rootLength;
-      formDataOffset += formDataLength;
    }
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;

 import java.util.HashMap;
 import java.util.Map;
-import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -78,7 +77,7 @@ class TrigramAutomaton {
    return state;
  }

-  int ngramScore(CharsRef s2) {
+  int ngramScore(CharSequence s2) {
    countedSubstrings.clear();

    int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
@@ -86,9 +85,9 @@ class TrigramAutomaton {
    // states of running the automaton on substrings [i-1, i) and [i-2, i)
    int state1 = -1, state2 = -1;

-    int limit = s2.length + s2.offset;
-    for (int i = s2.offset; i < limit; i++) {
-      char c = transformChar(s2.chars[i]);
+    int limit = s2.length();
+    for (int i = 0; i < limit; i++) {
+      char c = s2.charAt(i);
      if (c < minChar) {
        state1 = state2 = -1;
        continue;
@@ -121,10 +120,6 @@ class TrigramAutomaton {
    return score;
  }

-  char transformChar(char c) {
-    return c;
-  }
-
  private int substringScore(int state, FixedBitSet countedSubstrings) {
    if (countedSubstrings.getAndSet(state)) return 0;

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
@@ -300,9 +300,9 @@ public class WordFormGenerator {
        1,
        Integer.MAX_VALUE,
        false,
-        (root, lazyForms) -> {
-          String rootStr = root.toString();
-          IntsRef forms = lazyForms.get();
+        e -> {
+          String rootStr = e.root().toString();
+          IntsRef forms = e.forms();
          for (int i = 0; i < forms.length; i += dictionary.formStep()) {
            char[] encodedFlags = dictionary.flagLookup.getFlags(forms.ints[forms.offset + i]);
            if (shouldConsiderAtAll(encodedFlags)) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -19,8 +19,7 @@ package org.apache.lucene.analysis.hunspell;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.function.BiConsumer;
-import java.util.function.Supplier;
+import java.util.function.Consumer;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataOutput;
@@ -49,7 +48,7 @@ import org.apache.lucene.util.fst.IntSequenceOutputs;
 * The entries are stored in a contiguous byte array, identified by their offsets, using {@link
 * DataOutput#writeVInt} ()} VINT} format for compression.
 */
-class WordStorage {
+abstract class WordStorage {
  private static final int OFFSET_BITS = 25;
  private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
  private static final int COLLISION_MASK = 0x40;
@@ -91,12 +90,15 @@ class WordStorage {
   */
  private final byte[] wordData;

-  private WordStorage(
-      int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
-    this.maxEntryLength = maxEntryLength;
-    this.hasCustomMorphData = hasCustomMorphData;
-    this.hashTable = hashTable;
-    this.wordData = wordData;
+  WordStorage(Builder builder) throws IOException {
+    if (builder.hashTable.length > 0) {
+      assert !builder.group.isEmpty() : "WordStorage builder should be only used once";
+      builder.flushGroup();
+    }
+    this.maxEntryLength = builder.maxEntryLength;
+    this.hasCustomMorphData = builder.hasCustomMorphData;
+    this.hashTable = builder.hashTable.length == 0 ? new int[1] : builder.hashTable;
+    this.wordData = ArrayUtil.copyOfSubArray(builder.wordData, 0, builder.dataWriter.getPosition());
  }

  IntsRef lookupWord(char[] word, int offset, int length) {
@@ -157,22 +159,20 @@ class WordStorage {
   * or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they
   * can be modified in any way, but may not be saved for later by the processor
   */
-  void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+  void processSuggestibleWords(int minLength, int maxLength, Consumer<FlyweightEntry> processor) {
    processAllWords(minLength, maxLength, true, processor);
  }

  void processAllWords(
-      int minLength,
-      int maxLength,
-      boolean suggestibleOnly,
-      BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
+      int minLength, int maxLength, boolean suggestibleOnly, Consumer<FlyweightEntry> processor) {
    assert minLength <= maxLength;
    maxLength = Math.min(maxEntryLength, maxLength);

    CharsRef chars = new CharsRef(maxLength);
    ByteArrayDataInput in = new ByteArrayDataInput(wordData);
-    var formSupplier = new LazyFormReader(in);
+
+    var entry = new MyFlyweightEntry(chars, in);
+
    for (int entryCode : hashTable) {
      int pos = entryCode & OFFSET_MASK;
      int mask = entryCode >>> OFFSET_BITS;
@@ -195,7 +195,7 @@ class WordStorage {
        }

        if (mightMatch) {
-          formSupplier.dataPos = in.getPosition();
+          entry.dataPos = in.getPosition();
          while (prevPos != 0 && wordStart > 0) {
            in.setPosition(prevPos);
            chars.chars[--wordStart] = (char) in.readVInt();
@@ -205,7 +205,7 @@ class WordStorage {
          if (prevPos == 0) {
            chars.offset = wordStart;
            chars.length = maxLength - wordStart;
-            processor.accept(chars, formSupplier);
+            processor.accept(entry);
          }
        }

@@ -422,30 +422,61 @@ class WordStorage {
      }
      return false;
    }
-
-    WordStorage build() throws IOException {
-      if (hashTable.length > 0) {
-        assert !group.isEmpty() : "build() should be only called once";
-        flushGroup();
-      }
-      byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
-      int[] table = hashTable.length == 0 ? new int[1] : hashTable;
-      return new WordStorage(maxEntryLength, hasCustomMorphData, table, trimmedData);
-    }
  }

-  private class LazyFormReader implements Supplier<IntsRef> {
-    int dataPos;
-    private final ByteArrayDataInput in;
-    private final IntsRef forms;
+  abstract char caseFold(char c);

-    LazyFormReader(ByteArrayDataInput in) {
+  private class MyFlyweightEntry extends FlyweightEntry {
+    private final CharsRef chars;
+    private final ByteArrayDataInput in;
+    int dataPos;
+    private final IntsRef forms = new IntsRef();
+    private final CharSequence lower;
+
+    MyFlyweightEntry(CharsRef chars, ByteArrayDataInput in) {
+      this.chars = chars;
      this.in = in;
-      forms = new IntsRef();
+      lower =
+          new CharSequence() {
+            @Override
+            public int length() {
+              return chars.length;
+            }
+
+            @Override
+            public char charAt(int index) {
+              return caseFold(chars.chars[index + chars.offset]);
+            }
+
+            @Override
+            public CharSequence subSequence(int start, int end) {
+              throw new UnsupportedOperationException();
+            }
+
+            @Override
+            public String toString() {
+              throw new UnsupportedOperationException();
+            }
+          };
    }

    @Override
-    public IntsRef get() {
+    boolean hasTitleCase() {
+      return Character.isUpperCase(chars.charAt(0)) && WordCase.caseOf(chars) == WordCase.TITLE;
+    }
+
+    @Override
+    CharsRef root() {
+      return chars;
+    }
+
+    @Override
+    CharSequence lowerCaseRoot() {
+      return lower;
+    }
+
+    @Override
+    IntsRef forms() {
      in.setPosition(dataPos);
      int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
      if (forms.ints.length < entryCount) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -109,11 +109,11 @@ public class TestDictionary extends LuceneTestCase {
      Dictionary dictionary, int minLength, int maxLength) {
    Set<String> processed = new HashSet<>();
    dictionary.words.processSuggestibleWords(
-        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+        minLength, maxLength, e -> processed.add(e.root().toString()));

    Set<String> cached = new HashSet<>();
    SuggestibleEntryCache.buildCache(dictionary.words)
-        .processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
+        .processSuggestibleWords(minLength, maxLength, e -> cached.add(e.root().toString()));
    assertEquals(processed, cached);

    return processed;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -99,6 +99,16 @@ public class TestPerformance extends LuceneTestCase {
    checkSuggestionPerformance("fr", 1_000);
  }

+  @Test
+  public void uk() throws Exception {
+    checkAnalysisPerformance("uk", 200_000);
+  }
+
+  @Test
+  public void uk_suggest() throws Exception {
+    checkSuggestionPerformance("uk", 700);
+  }
+
  private Dictionary loadDictionary(String code) throws IOException, ParseException {
    long start = System.nanoTime();
    Path aff = findAffFile(code);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic
@@ -2,4 +2,5 @@
 uART/XW-
 bein/XW-
 Stand/UX
-UART/-
+UART/-
+YouTube
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.sug
@@ -0,0 +1,3 @@
+YouTube
+UART
+UART
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong
@@ -1,3 +1,4 @@
+You
 StandUart
 uART
 Uart