From 422c89baefda8e8e8a9a3f9b761785895b5e9bb5 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Tue, 19 Jan 2021 09:32:23 +0100
Subject: [PATCH] LUCENE-9676: Hunspell: improve stemming of all-caps words
 (#2217)

Hunspell: improve stemming of all-caps words

Repeat Hunspell's logic:
* when encountering a mixed-case or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry for stemming case variants for title- and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry of that title case
---
 lucene/CHANGES.txt                            |   3 +-
 .../lucene/analysis/hunspell/Dictionary.java  | 212 +++++++++++-------
 .../lucene/analysis/hunspell/Stemmer.java     |  92 +++-----
 .../lucene/analysis/hunspell/WordCase.java    |  61 +++++
 .../lucene/analysis/hunspell/TestAllCaps.java |  42 ++++
 .../lucene/analysis/hunspell/TestEscaped.java |   2 +-
 .../lucene/analysis/hunspell/allcaps.aff      |   5 +
 .../lucene/analysis/hunspell/allcaps.dic      |   3 +
 .../lucene/analysis/hunspell/escaped.aff      |   1 +
 9 files changed, 283 insertions(+), 138 deletions(-)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f21e0829269..9dec819134b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -84,7 +84,8 @@ API Changes
 
 Improvements
 
-* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
+  (Peter Gromov)
 
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 81b4beecf4e..34edb73c1a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -73,6 +73,8 @@ public class Dictionary {
 
   static final char[] NOFLAGS = new char[0];
 
+  private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
+
   private static final String ALIAS_KEY = "AF";
   private static final String MORPH_ALIAS_KEY = "AM";
   private static final String PREFIX_KEY = "PFX";
@@ -238,10 +240,9 @@ public class Dictionary {
       readAffixFile(aff2, decoder);
 
       // read dictionary entries
-      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
-      FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
-      readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
-      words = fstCompiler.compile();
+      IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
+      String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
+      words = readSortedDictionaries(tempDir, sortedFile);
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
       success = true;
@@ -791,25 +792,13 @@ public class Dictionary {
     }
   }
 
-  /**
-   * Reads the dictionary file through the provided InputStreams, building up the words map
-   *
-   * @param dictionaries InputStreams to read the dictionary file through
-   * @param decoder CharsetDecoder used to decode the contents of the file
-   * @throws IOException Can be thrown while reading from the file
-   */
-  private void readDictionaryFiles(
+  private IndexOutput mergeDictionaries(
       Directory tempDir,
       String tempFileNamePrefix,
       List<InputStream> dictionaries,
-      CharsetDecoder decoder,
-      FSTCompiler<IntsRef> words)
+      CharsetDecoder decoder)
       throws IOException {
-    BytesRefBuilder flagsScratch = new BytesRefBuilder();
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-
     StringBuilder sb = new StringBuilder();
-
     IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
       for (InputStream dictionary : dictionaries) {
@@ -833,32 +822,58 @@ public class Dictionary {
               hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
             }
           }
-          if (needsInputCleaning) {
-            int flagSep = line.indexOf(FLAG_SEPARATOR);
-            if (flagSep == -1) {
-              flagSep = line.indexOf(MORPH_SEPARATOR);
-            }
-            if (flagSep == -1) {
-              CharSequence cleansed = cleanInput(line, sb);
-              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
-            } else {
-              String text = line.substring(0, flagSep);
-              CharSequence cleansed = cleanInput(text, sb);
-              if (cleansed != sb) {
-                sb.setLength(0);
-                sb.append(cleansed);
-              }
-              sb.append(line.substring(flagSep));
-              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
-            }
-          } else {
-            writer.write(line.getBytes(StandardCharsets.UTF_8));
-          }
+
+          writeNormalizedWordEntry(sb, writer, line);
         }
       }
       CodecUtil.writeFooter(unsorted);
     }
+    return unsorted;
+  }
 
+  private void writeNormalizedWordEntry(
+      StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
+    int flagSep = line.indexOf(FLAG_SEPARATOR);
+    int morphSep = line.indexOf(MORPH_SEPARATOR);
+    assert morphSep > 0;
+    assert morphSep > flagSep;
+    int sep = flagSep < 0 ? morphSep : flagSep;
+
+    CharSequence toWrite;
+    if (needsInputCleaning) {
+      cleanInput(line, sep, reuse);
+      reuse.append(line, sep, line.length());
+      toWrite = reuse;
+    } else {
+      toWrite = line;
+    }
+
+    String written = toWrite.toString();
+    sep = written.length() - (line.length() - sep);
+    writer.write(written.getBytes(StandardCharsets.UTF_8));
+
+    WordCase wordCase = WordCase.caseOf(written, sep);
+    if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
+      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+    }
+  }
+
+  private void addHiddenCapitalizedWord(
+      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
+      throws IOException {
+    reuse.setLength(0);
+    reuse.append(Character.toUpperCase(word.charAt(0)));
+    for (int i = 1; i < word.length(); i++) {
+      reuse.append(caseFold(word.charAt(i)));
+    }
+    reuse.append(FLAG_SEPARATOR);
+    reuse.append(HIDDEN_FLAG);
+    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
+    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+  }
+
+  private String sortWordsOffline(
+      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
     OfflineSorter sorter =
         new OfflineSorter(
             tempDir,
@@ -908,8 +923,13 @@ public class Dictionary {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
       }
     }
+    return sorted;
+  }
 
-    boolean success2 = false;
+  private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
+    boolean success = false;
+
+    EntryGrouper grouper = new EntryGrouper();
 
     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -917,9 +937,6 @@ public class Dictionary {
       // TODO: the flags themselves can be double-chars (long) or also numeric
       // either way the trick is to encode them as char... but they must be parsed differently
 
-      String currentEntry = null;
-      IntsRefBuilder currentOrds = new IntsRefBuilder();
-
       while (true) {
         BytesRef scratch = reader.next();
         if (scratch == null) {
@@ -959,42 +976,15 @@ public class Dictionary {
           }
         }
 
-        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
-        if (cmp < 0) {
-          throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
-        } else {
-          encodeFlags(flagsScratch, wordForm);
-          int ord = flagLookup.add(flagsScratch.get());
-          if (ord < 0) {
-            // already exists in our hash
-            ord = (-ord) - 1;
-          }
-          // finalize current entry, and switch "current" if necessary
-          if (cmp > 0 && currentEntry != null) {
-            Util.toUTF32(currentEntry, scratchInts);
-            words.add(scratchInts.get(), currentOrds.get());
-          }
-          // swap current
-          if (cmp > 0) {
-            currentEntry = entry;
-            currentOrds = new IntsRefBuilder(); // must be this way
-          }
-          if (hasStemExceptions) {
-            currentOrds.append(ord);
-            currentOrds.append(stemExceptionID);
-          } else {
-            currentOrds.append(ord);
-          }
-        }
+        grouper.add(entry, wordForm, stemExceptionID);
       }
 
       // finalize last entry
-      assert currentEntry != null;
-      Util.toUTF32(currentEntry, scratchInts);
-      words.add(scratchInts.get(), currentOrds.get());
-      success2 = true;
+      grouper.flushGroup();
+      success = true;
+      return grouper.words.compile();
     } finally {
-      if (success2) {
+      if (success) {
         tempDir.deleteFile(sorted);
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@@ -1002,6 +992,72 @@ public class Dictionary {
     }
   }
 
+  private class EntryGrouper {
+    final FSTCompiler<IntsRef> words =
+        new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
+    private final List<char[]> group = new ArrayList<>();
+    private final List<Integer> stemExceptionIDs = new ArrayList<>();
+    private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
+    private final IntsRefBuilder scratchInts = new IntsRefBuilder();
+    private String currentEntry = null;
+
+    void add(String entry, char[] flags, int stemExceptionID) throws IOException {
+      if (!entry.equals(currentEntry)) {
+        if (currentEntry != null) {
+          if (entry.compareTo(currentEntry) < 0) {
+            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
+          }
+          flushGroup();
+        }
+        currentEntry = entry;
+      }
+
+      group.add(flags);
+      if (hasStemExceptions) {
+        stemExceptionIDs.add(stemExceptionID);
+      }
+    }
+
+    void flushGroup() throws IOException {
+      IntsRefBuilder currentOrds = new IntsRefBuilder();
+
+      boolean hasNonHidden = false;
+      for (char[] flags : group) {
+        if (!hasHiddenFlag(flags)) {
+          hasNonHidden = true;
+          break;
+        }
+      }
+
+      for (int i = 0; i < group.size(); i++) {
+        char[] flags = group.get(i);
+        if (hasNonHidden && hasHiddenFlag(flags)) {
+          continue;
+        }
+
+        encodeFlags(flagsScratch, flags);
+        int ord = flagLookup.add(flagsScratch.get());
+        if (ord < 0) {
+          ord = -ord - 1; // already exists in our hash
+        }
+        currentOrds.append(ord);
+        if (hasStemExceptions) {
+          currentOrds.append(stemExceptionIDs.get(i));
+        }
+      }
+
+      Util.toUTF32(currentEntry, scratchInts);
+      words.add(scratchInts.get(), currentOrds.get());
+
+      group.clear();
+      stemExceptionIDs.clear();
+    }
+  }
+
+  static boolean hasHiddenFlag(char[] flags) {
+    return hasFlag(flags, HIDDEN_FLAG);
+  }
+
   static char[] decodeFlags(BytesRef b) {
     if (b.length == 0) {
       return CharsRef.EMPTY_CHARS;
@@ -1191,9 +1247,13 @@ public class Dictionary {
   }
 
   CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
+    return cleanInput(input, input.length(), reuse);
+  }
+
+  private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
     reuse.setLength(0);
 
-    for (int i = 0; i < input.length(); i++) {
+    for (int i = 0; i < prefixLength; i++) {
       char ch = input.charAt(i);
 
       if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 7b50da26323..413e570f2a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -95,57 +95,30 @@ final class Stemmer {
       word = scratchBuffer;
     }
 
-    int caseType = caseOf(word, length);
-    if (caseType == UPPER_CASE) {
-      // upper: union exact, title, lower
+    WordCase wordCase = caseOf(word, length);
+    List<CharsRef> list = doStem(word, length, false);
+    if (wordCase == WordCase.UPPER) {
       caseFoldTitle(word, length);
-      caseFoldLower(titleBuffer, length);
-      List<CharsRef> list = doStem(word, length, false);
       list.addAll(doStem(titleBuffer, length, true));
-      list.addAll(doStem(lowerBuffer, length, true));
-      return list;
-    } else if (caseType == TITLE_CASE) {
-      // title: union exact, lower
-      caseFoldLower(word, length);
-      List<CharsRef> list = doStem(word, length, false);
-      list.addAll(doStem(lowerBuffer, length, true));
-      return list;
-    } else {
-      // exact match only
-      return doStem(word, length, false);
     }
+    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
+      caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
+      list.addAll(doStem(lowerBuffer, length, true));
+    }
+    return list;
   }
 
   // temporary buffers for case variants
   private char[] lowerBuffer = new char[8];
   private char[] titleBuffer = new char[8];
 
-  private static final int EXACT_CASE = 0;
-  private static final int TITLE_CASE = 1;
-  private static final int UPPER_CASE = 2;
-
-  /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private int caseOf(char[] word, int length) {
+  /** returns UPPER or TITLE when case variants should also be stemmed, MIXED otherwise */
+  private WordCase caseOf(char[] word, int length) {
     if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
-      return EXACT_CASE;
+      return WordCase.MIXED;
     }
 
-    // determine if we are title or lowercase (or something funky, in which it's exact)
-    boolean seenUpper = false;
-    boolean seenLower = false;
-    for (int i = 1; i < length; i++) {
-      boolean v = Character.isUpperCase(word[i]);
-      seenUpper |= v;
-      seenLower |= !v;
-    }
-
-    if (!seenLower) {
-      return UPPER_CASE;
-    } else if (!seenUpper) {
-      return TITLE_CASE;
-    } else {
-      return EXACT_CASE;
-    }
+    return WordCase.caseOf(word, length);
   }
 
   /** folds titlecase variant of word to titleBuffer */
@@ -169,25 +142,20 @@ final class Stemmer {
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
       for (int i = 0; i < forms.length; i += formStep) {
-        boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
-        boolean checkNeedAffix = dictionary.needaffix != -1;
-        boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
-        if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
-          dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
-          char[] wordFlags = Dictionary.decodeFlags(scratch);
-          // we are looking for a case variant, but this word does not allow it
-          if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
-            continue;
-          }
-          // we can't add this form, it's a pseudostem requiring an affix
-          if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
-            continue;
-          }
-          // we can't add this form, it only belongs inside a compound word
-          if (checkOnlyInCompound
-              && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
-            continue;
-          }
+        dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
+        char[] wordFlags = Dictionary.decodeFlags(scratch);
+        if (!acceptCase(caseVariant, wordFlags)) {
+          continue;
+        }
+        // we can't add this form, it's a pseudostem requiring an affix
+        if (dictionary.needaffix != -1
+            && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
+          continue;
+        }
+        // we can't add this form, it only belongs inside a compound word
+        if (dictionary.onlyincompound != -1
+            && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
+          continue;
         }
         stems.add(newStem(word, length, forms, i));
       }
@@ -200,6 +168,12 @@ final class Stemmer {
     return stems;
   }
 
+  private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
+    return caseVariant
+        ? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
+        : !Dictionary.hasHiddenFlag(wordFlags);
+  }
+
   /**
    * Find the unique stem(s) of the provided word
    *
@@ -595,9 +569,7 @@ final class Stemmer {
           }
 
           // we are looking for a case variant, but this word does not allow it
-          if (caseVariant
-              && dictionary.keepcase != -1
-              && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
+          if (!acceptCase(caseVariant, wordFlags)) {
             continue;
           }
           // we aren't decompounding (yet)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
new file mode 100644
index 00000000000..7d9e2e75873
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+enum WordCase {
+  UPPER,
+  TITLE,
+  LOWER,
+  MIXED;
+
+  static WordCase caseOf(char[] word, int length) {
+    boolean capitalized = Character.isUpperCase(word[0]);
+
+    boolean seenUpper = false;
+    boolean seenLower = false;
+    for (int i = 1; i < length; i++) {
+      char ch = word[i];
+      seenUpper = seenUpper || Character.isUpperCase(ch);
+      seenLower = seenLower || Character.isLowerCase(ch);
+      if (seenUpper && seenLower) break;
+    }
+
+    return get(capitalized, seenUpper, seenLower);
+  }
+
+  static WordCase caseOf(CharSequence word, int length) {
+    boolean capitalized = Character.isUpperCase(word.charAt(0));
+
+    boolean seenUpper = false;
+    boolean seenLower = false;
+    for (int i = 1; i < length; i++) {
+      char ch = word.charAt(i);
+      seenUpper = seenUpper || Character.isUpperCase(ch);
+      seenLower = seenLower || Character.isLowerCase(ch);
+      if (seenUpper && seenLower) break;
+    }
+
+    return get(capitalized, seenUpper, seenLower);
+  }
+
+  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
+    if (capitalized) {
+      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
+    }
+    return seenUpper ? MIXED : LOWER;
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
new file mode 100644
index 00000000000..43c67644b3e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.junit.BeforeClass;
+
+public class TestAllCaps extends StemmerTestBase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("allcaps.aff", "allcaps.dic");
+  }
+
+  public void testGood() {
+    assertStemsTo("OpenOffice.org", "OpenOffice.org");
+    assertStemsTo("UNICEF's", "UNICEF");
+
+    // Hunspell returns these title-cased stems, so for consistency we do, too
+    assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
+    assertStemsTo("UNICEF'S", "Unicef");
+  }
+
+  public void testWrong() {
+    assertStemsTo("Openoffice.org");
+    assertStemsTo("Unicef");
+    assertStemsTo("Unicef's");
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
index a5e9fe1d0a0..3038385665f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
@@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
   public void testStemming() {
     assertStemsTo("works", "work");
     assertStemsTo("work", "work");
-    assertStemsTo("R2/D2", "R2/D2");
+    assertStemsTo("R2/D2", "R2/D2", "R2/d2");
     assertStemsTo("R2/D2s", "R2/D2");
     assertStemsTo("N/A", "N/A");
     assertStemsTo("N/As");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff
new file mode 100644
index 00000000000..57e916bf537
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff
@@ -0,0 +1,5 @@
+# check uppercase forms of allcaps word + affix and words with mixed casing
+WORDCHARS '.
+
+SFX S N 1
+SFX S   0     's      .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic
new file mode 100644
index 00000000000..7d3cdcc0469
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic
@@ -0,0 +1,3 @@
+2
+OpenOffice.org
+UNICEF/S
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
index b42845175e2..a66ee3695f5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
@@ -1,4 +1,5 @@
 SET UTF-8
+WORDCHARS \/0123456789
 
 SFX A Y 1
 SFX A 0 s . +PLUR