LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217)

Hunspell: improve stemming of all-caps words

Repeat Hunspell's logic:
* when encountering a mixed- or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry for stemming case variants for title- and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry of that title case
This commit is contained in:
Peter Gromov 2021-01-19 09:32:23 +01:00 committed by GitHub
parent c1ae6dc07c
commit 422c89baef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 283 additions and 138 deletions

View File

@ -84,7 +84,8 @@ API Changes
Improvements
* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
(Peter Gromov)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)

View File

@ -73,6 +73,8 @@ public class Dictionary {
static final char[] NOFLAGS = new char[0];
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
private static final String ALIAS_KEY = "AF";
private static final String MORPH_ALIAS_KEY = "AM";
private static final String PREFIX_KEY = "PFX";
@ -238,10 +240,9 @@ public class Dictionary {
readAffixFile(aff2, decoder);
// read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
words = fstCompiler.compile();
IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile);
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
@ -791,25 +792,13 @@ public class Dictionary {
}
}
/**
* Reads the dictionary file through the provided InputStreams, building up the words map
*
* @param dictionaries InputStreams to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFiles(
private IndexOutput mergeDictionaries(
Directory tempDir,
String tempFileNamePrefix,
List<InputStream> dictionaries,
CharsetDecoder decoder,
FSTCompiler<IntsRef> words)
CharsetDecoder decoder)
throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
StringBuilder sb = new StringBuilder();
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
for (InputStream dictionary : dictionaries) {
@ -833,32 +822,58 @@ public class Dictionary {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
}
}
if (needsInputCleaning) {
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
flagSep = line.indexOf(MORPH_SEPARATOR);
}
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
} else {
String text = line.substring(0, flagSep);
CharSequence cleansed = cleanInput(text, sb);
if (cleansed != sb) {
sb.setLength(0);
sb.append(cleansed);
}
sb.append(line.substring(flagSep));
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
} else {
writer.write(line.getBytes(StandardCharsets.UTF_8));
}
writeNormalizedWordEntry(sb, writer, line);
}
}
CodecUtil.writeFooter(unsorted);
}
return unsorted;
}
/**
 * Normalizes one raw .dic line and writes it to the offline-sort writer.
 *
 * <p>Only the word part (everything before the flag/morph separator) is cleaned; the
 * flags and morphological data are carried over unchanged. For mixed-case words — and
 * all-caps words that carry flags — an additional hidden title-case entry is written
 * (see {@code addHiddenCapitalizedWord}), mirroring Hunspell's ONLYUPCASEFLAG handling.
 *
 * @param reuse scratch buffer, clobbered by this method
 * @param writer destination for the entry bytes (UTF-8)
 * @param line a single raw dictionary line; must contain MORPH_SEPARATOR
 * @throws IOException if the writer fails
 */
private void writeNormalizedWordEntry(
StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
assert morphSep > flagSep;
// the word part ends at the flag separator if present, else at the morph separator
int sep = flagSep < 0 ? morphSep : flagSep;
CharSequence toWrite;
if (needsInputCleaning) {
// clean only the word prefix, then append flags/morph data verbatim
cleanInput(line, sep, reuse);
reuse.append(line, sep, line.length());
toWrite = reuse;
} else {
toWrite = line;
}
String written = toWrite.toString();
// cleaning may change the word's length; recompute the separator position
sep = written.length() - (line.length() - sep);
writer.write(written.getBytes(StandardCharsets.UTF_8));
WordCase wordCase = WordCase.caseOf(written, sep);
// hidden title-case analog for mixed-case words, or all-caps words with flags
// (note precedence: MIXED || (UPPER && flagSep > 0))
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
}
}
/**
 * Writes an extra, hidden dictionary entry: the title-case analog of {@code word},
 * marked with HIDDEN_FLAG so it can stem case variants of the word without being
 * accepted as a valid word itself.
 *
 * @param reuse scratch buffer, clobbered by this method
 * @param writer destination for the entry bytes (UTF-8)
 * @param word the word part of the original entry
 * @param afterSep the original entry's remainder, starting at the separator
 * @throws IOException if the writer fails
 */
private void addHiddenCapitalizedWord(
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
throws IOException {
reuse.setLength(0);
// title-case: upper-case the first character, case-fold the remainder
reuse.append(Character.toUpperCase(word.charAt(0)));
int i = 1;
while (i < word.length()) {
reuse.append(caseFold(word.charAt(i)));
i++;
}
// mark the entry as hidden, then carry over the original flags/morph data
reuse.append(FLAG_SEPARATOR).append(HIDDEN_FLAG);
int from = afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0;
reuse.append(afterSep, from, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
@ -908,8 +923,13 @@ public class Dictionary {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
return sorted;
}
boolean success2 = false;
private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
boolean success = false;
EntryGrouper grouper = new EntryGrouper();
try (ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@ -917,9 +937,6 @@ public class Dictionary {
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
String currentEntry = null;
IntsRefBuilder currentOrds = new IntsRefBuilder();
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
@ -959,42 +976,15 @@ public class Dictionary {
}
}
int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
if (cmp < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
// already exists in our hash
ord = (-ord) - 1;
}
// finalize current entry, and switch "current" if necessary
if (cmp > 0 && currentEntry != null) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
}
// swap current
if (cmp > 0) {
currentEntry = entry;
currentOrds = new IntsRefBuilder(); // must be this way
}
if (hasStemExceptions) {
currentOrds.append(ord);
currentOrds.append(stemExceptionID);
} else {
currentOrds.append(ord);
}
}
grouper.add(entry, wordForm, stemExceptionID);
}
// finalize last entry
assert currentEntry != null;
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
grouper.flushGroup();
success = true;
return grouper.words.compile();
} finally {
if (success2) {
if (success) {
tempDir.deleteFile(sorted);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@ -1002,6 +992,72 @@ public class Dictionary {
}
}
/**
 * Groups consecutive identical sorted entries and, when the entry changes, emits one
 * FST arc for the previous word carrying all of its flag-set ordinals (plus stem
 * exception ids when the dictionary has them). Input must arrive in sorted order.
 */
private class EntryGrouper {
final FSTCompiler<IntsRef> words =
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
// flag sets collected for the current word, one per homonym line
private final List<char[]> group = new ArrayList<>();
// parallel to 'group'; only populated when hasStemExceptions
private final List<Integer> stemExceptionIDs = new ArrayList<>();
private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private String currentEntry = null;

/**
 * Accepts the next sorted (entry, flags) pair; flushes the previous group when the
 * entry text changes.
 *
 * @throws IllegalArgumentException if entries arrive out of sorted order
 */
void add(String entry, char[] flags, int stemExceptionID) throws IOException {
if (!entry.equals(currentEntry)) {
if (currentEntry != null) {
if (entry.compareTo(currentEntry) < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
}
flushGroup();
}
currentEntry = entry;
}
group.add(flags);
if (hasStemExceptions) {
stemExceptionIDs.add(stemExceptionID);
}
}

/** Writes the accumulated group for {@code currentEntry} into the FST and resets it. */
void flushGroup() throws IOException {
IntsRefBuilder currentOrds = new IntsRefBuilder();

// If any flag set lacks HIDDEN_FLAG, an explicit entry of this form exists and
// the synthetic hidden (title-case analog) entries are dropped.
boolean hasNonHidden = false;
for (char[] flags : group) {
if (!hasHiddenFlag(flags)) {
hasNonHidden = true;
break;
}
}

for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i);
if (hasNonHidden && hasHiddenFlag(flags)) {
continue;
}

encodeFlags(flagsScratch, flags);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
ord = -ord - 1; // already exists in our hash
}

currentOrds.append(ord);
if (hasStemExceptions) {
currentOrds.append(stemExceptionIDs.get(i));
}
}

Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());

group.clear();
stemExceptionIDs.clear();
}
}
/**
 * Returns true if {@code flags} contains HIDDEN_FLAG, i.e. this form is a synthetic
 * title-case analog of a dictionary entry rather than an explicit word.
 */
static boolean hasHiddenFlag(char[] flags) {
return hasFlag(flags, HIDDEN_FLAG);
}
static char[] decodeFlags(BytesRef b) {
if (b.length == 0) {
return CharsRef.EMPTY_CHARS;
@ -1191,9 +1247,13 @@ public class Dictionary {
}
/** Cleans the whole of {@code input}; delegates to the prefix-limited private overload. */
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
return cleanInput(input, input.length(), reuse);
}
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
reuse.setLength(0);
for (int i = 0; i < input.length(); i++) {
for (int i = 0; i < prefixLength; i++) {
char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {

View File

@ -95,57 +95,30 @@ final class Stemmer {
word = scratchBuffer;
}
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
// upper: union exact, title, lower
WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, length, false);
if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length);
caseFoldLower(titleBuffer, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(titleBuffer, length, true));
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else if (caseType == TITLE_CASE) {
// title: union exact, lower
caseFoldLower(word, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else {
// exact match only
return doStem(word, length, false);
}
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, length, true));
}
return list;
}
// temporary buffers for case variants
private char[] lowerBuffer = new char[8];
private char[] titleBuffer = new char[8];
private static final int EXACT_CASE = 0;
private static final int TITLE_CASE = 1;
private static final int UPPER_CASE = 2;
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
private int caseOf(char[] word, int length) {
private WordCase caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
return WordCase.MIXED;
}
// determine if we are title or lowercase (or something funky, in which it's exact)
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
boolean v = Character.isUpperCase(word[i]);
seenUpper |= v;
seenLower |= !v;
}
if (!seenLower) {
return UPPER_CASE;
} else if (!seenUpper) {
return TITLE_CASE;
} else {
return EXACT_CASE;
}
return WordCase.caseOf(word, length);
}
/** folds titlecase variant of word to titleBuffer */
@ -169,25 +142,20 @@ final class Stemmer {
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
boolean checkNeedAffix = dictionary.needaffix != -1;
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
// we are looking for a case variant, but this word does not allow it
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (checkOnlyInCompound
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.needaffix != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (dictionary.onlyincompound != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(word, length, forms, i));
}
@ -200,6 +168,12 @@ final class Stemmer {
return stems;
}
/**
 * Whether a dictionary form may be used as a stem for the current lookup: a case
 * variant must not carry the keepcase flag, while an exact-case lookup rejects
 * hidden entries (title-case analogs), which are not valid words themselves.
 */
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
return caseVariant
? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
: !Dictionary.hasHiddenFlag(wordFlags);
}
/**
* Find the unique stem(s) of the provided word
*
@ -595,9 +569,7 @@ final class Stemmer {
}
// we are looking for a case variant, but this word does not allow it
if (caseVariant
&& dictionary.keepcase != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we aren't decompounding (yet)

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
/**
 * The case pattern of a word: fully uppercase, title case (initial capital only),
 * fully lowercase, or anything else ({@link #MIXED}).
 */
enum WordCase {
  UPPER,
  TITLE,
  LOWER,
  MIXED;

  /**
   * Determines the case pattern of the first {@code length} chars of {@code word}.
   *
   * <p>Delegates to the {@link CharSequence} overload via a read-only wrapper to avoid
   * duplicating the scan logic.
   */
  static WordCase caseOf(char[] word, int length) {
    return caseOf(java.nio.CharBuffer.wrap(word, 0, length), length);
  }

  /** Determines the case pattern of the first {@code length} chars of {@code word}. */
  static WordCase caseOf(CharSequence word, int length) {
    boolean capitalized = Character.isUpperCase(word.charAt(0));

    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
      char ch = word.charAt(i);
      seenUpper = seenUpper || Character.isUpperCase(ch);
      seenLower = seenLower || Character.isLowerCase(ch);
      // once both cases were seen the outcome can no longer change; stop scanning
      if (seenUpper && seenLower) break;
    }

    return get(capitalized, seenUpper, seenLower);
  }

  /** Maps the scan results to the enum constant; a capitalized single char is UPPER. */
  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
    if (capitalized) {
      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
    }
    return seenUpper ? MIXED : LOWER;
  }
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import org.junit.BeforeClass;
/** Tests stemming of all-caps words (LUCENE-9676) against allcaps.aff / allcaps.dic. */
public class TestAllCaps extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("allcaps.aff", "allcaps.dic");
}

// valid inputs: exact-case entries and uppercase variants stemmed via hidden entries
public void testGood() {
assertStemsTo("OpenOffice.org", "OpenOffice.org");
assertStemsTo("UNICEF's", "UNICEF");
// Hunspell returns these title-cased stems, so for consistency we do, too
assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
assertStemsTo("UNICEF'S", "Unicef");
}

// the hidden title-case analogs must not be accepted as words themselves
public void testWrong() {
assertStemsTo("Openoffice.org");
assertStemsTo("Unicef");
assertStemsTo("Unicef's");
}
}

View File

@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
public void testStemming() {
assertStemsTo("works", "work");
assertStemsTo("work", "work");
assertStemsTo("R2/D2", "R2/D2");
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
assertStemsTo("R2/D2s", "R2/D2");
assertStemsTo("N/A", "N/A");
assertStemsTo("N/As");

View File

@ -0,0 +1,5 @@
# check uppercase forms of allcaps word + affix and words with mixed casing
WORDCHARS '.
SFX S N 1
SFX S 0 's .

View File

@ -0,0 +1,3 @@
2
OpenOffice.org
UNICEF/S

View File

@ -1,4 +1,5 @@
SET UTF-8
WORDCHARS \/0123456789
SFX A Y 1
SFX A 0 s . +PLUR