LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217)

Hunspell: improve stemming of all-caps words

Repeat Hunspell's logic:
* when encountering a mixed-case or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry for stemming case variants of title-case and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry of that title case
2021-01-19 09:32:23 +01:00 · 2021-01-19 09:32:23 +01:00 · 422c89baef
parent c1ae6dc07c
commit 422c89baef
9 changed files with 283 additions and 138 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -84,7 +84,8 @@ API Changes
 Improvements
-* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
  (Peter Gromov)
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -73,6 +73,8 @@ public class Dictionary {
  static final char[] NOFLAGS = new char[0];
  private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
  private static final String ALIAS_KEY = "AF";
  private static final String MORPH_ALIAS_KEY = "AM";
  private static final String PREFIX_KEY = "PFX";
@ -238,10 +240,9 @@ public class Dictionary {
      readAffixFile(aff2, decoder);
      // read dictionary entries
-      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
+      IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
-      FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
+      String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
-      readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
+      words = readSortedDictionaries(tempDir, sortedFile);
      words = fstCompiler.compile();
      aliases = null; // no longer needed
      morphAliases = null; // no longer needed
      success = true;
@ -791,25 +792,13 @@ public class Dictionary {
    }
  }
-  /**
+  private IndexOutput mergeDictionaries(
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(
      Directory tempDir,
      String tempFileNamePrefix,
      List<InputStream> dictionaries,
-      CharsetDecoder decoder,
+      CharsetDecoder decoder)
      FSTCompiler<IntsRef> words)
      throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    StringBuilder sb = new StringBuilder();
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
      for (InputStream dictionary : dictionaries) {
@ -833,32 +822,58 @@ public class Dictionary {
              hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
            }
          }
-          if (needsInputCleaning) {
+
-            int flagSep = line.indexOf(FLAG_SEPARATOR);
+          writeNormalizedWordEntry(sb, writer, line);
            if (flagSep == -1) {
              flagSep = line.indexOf(MORPH_SEPARATOR);
            }
            if (flagSep == -1) {
              CharSequence cleansed = cleanInput(line, sb);
              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
            } else {
              String text = line.substring(0, flagSep);
              CharSequence cleansed = cleanInput(text, sb);
              if (cleansed != sb) {
                sb.setLength(0);
                sb.append(cleansed);
              }
              sb.append(line.substring(flagSep));
              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
            }
          } else {
            writer.write(line.getBytes(StandardCharsets.UTF_8));
          }
        }
      }
      CodecUtil.writeFooter(unsorted);
    }
    return unsorted;
  }
  /**
   * Writes one dictionary line to {@code writer}, cleaning the word part first when input
   * cleaning is enabled, and follows it with a hidden title-case entry when the word is
   * mixed-case, or all-uppercase with flags attached.
   *
   * <p>The line is expected to contain a morphological separator located after the flag
   * separator (if any); both expectations are asserted.
   *
   * @param reuse scratch builder; its previous content is not preserved
   * @param writer destination of the encoded entries
   * @param line the dictionary line to write
   * @throws IOException if writing to {@code writer} fails
   */
  private void writeNormalizedWordEntry(
      StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
    final int flagSep = line.indexOf(FLAG_SEPARATOR);
    final int morphSep = line.indexOf(MORPH_SEPARATOR);
    assert morphSep > 0;
    assert morphSep > flagSep;
    // the word ends at the flag separator if there is one, otherwise at the morph separator
    final int wordEndInLine = flagSep >= 0 ? flagSep : morphSep;

    final String written;
    if (needsInputCleaning) {
      // only the word part is cleaned; flags and morphological data are copied verbatim
      cleanInput(line, wordEndInLine, reuse);
      reuse.append(line, wordEndInLine, line.length());
      written = reuse.toString();
    } else {
      written = line;
    }
    writer.write(written.getBytes(StandardCharsets.UTF_8));

    // cleaning may have changed the word length, so locate the separator from the end
    final int tailLength = line.length() - wordEndInLine;
    final int wordEnd = written.length() - tailLength;

    final WordCase wordCase = WordCase.caseOf(written, wordEnd);
    final boolean allCapsWithFlags = wordCase == WordCase.UPPER && flagSep > 0;
    if (wordCase == WordCase.MIXED || allCapsWithFlags) {
      String word = written.substring(0, wordEnd);
      String afterWord = written.substring(wordEnd);
      addHiddenCapitalizedWord(reuse, writer, word, afterWord);
    }
  }
  /**
   * Writes a title-case variant of {@code word} (first character upper-cased, the rest
   * case-folded) marked with {@code HIDDEN_FLAG}, keeping whatever followed the word in the
   * original entry.
   *
   * @param reuse scratch builder; its previous content is discarded
   * @param writer destination of the encoded entry
   * @param word the word part of the original entry
   * @param afterSep the rest of the original entry, starting at its separator
   * @throws IOException if writing to {@code writer} fails
   */
  private void addHiddenCapitalizedWord(
      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
      throws IOException {
    reuse.setLength(0);
    reuse.append(Character.toUpperCase(word.charAt(0)));
    int pos = 1;
    while (pos < word.length()) {
      reuse.append(caseFold(word.charAt(pos)));
      pos++;
    }

    // a flag separator is emitted below, so skip the original one if the tail begins with it
    final int tailStart = afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0;
    reuse.append(FLAG_SEPARATOR).append(HIDDEN_FLAG).append(afterSep, tailStart, afterSep.length());

    final String hiddenEntry = reuse.toString();
    writer.write(hiddenEntry.getBytes(StandardCharsets.UTF_8));
  }
  private String sortWordsOffline(
      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
    OfflineSorter sorter =
        new OfflineSorter(
            tempDir,
@ -908,8 +923,13 @@ public class Dictionary {
        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
      }
    }
    return sorted;
  }
-    boolean success2 = false;
+  private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
    boolean success = false;
    EntryGrouper grouper = new EntryGrouper();
    try (ByteSequencesReader reader =
        new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@ -917,9 +937,6 @@ public class Dictionary {
      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently
      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();
      while (true) {
        BytesRef scratch = reader.next();
        if (scratch == null) {
@ -959,42 +976,15 @@ public class Dictionary {
          }
        }
-        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
+        grouper.add(entry, wordForm, stemExceptionID);
        if (cmp < 0) {
          throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
          encodeFlags(flagsScratch, wordForm);
          int ord = flagLookup.add(flagsScratch.get());
          if (ord < 0) {
            // already exists in our hash
            ord = (-ord) - 1;
          }
          // finalize current entry, and switch "current" if necessary
          if (cmp > 0 && currentEntry != null) {
            Util.toUTF32(currentEntry, scratchInts);
            words.add(scratchInts.get(), currentOrds.get());
          }
          // swap current
          if (cmp > 0) {
            currentEntry = entry;
            currentOrds = new IntsRefBuilder(); // must be this way
          }
          if (hasStemExceptions) {
            currentOrds.append(ord);
            currentOrds.append(stemExceptionID);
          } else {
            currentOrds.append(ord);
          }
        }
      }
      // finalize last entry
-      assert currentEntry != null;
+      grouper.flushGroup();
-      Util.toUTF32(currentEntry, scratchInts);
+      success = true;
-      words.add(scratchInts.get(), currentOrds.get());
+      return grouper.words.compile();
      success2 = true;
    } finally {
-      if (success2) {
+      if (success) {
        tempDir.deleteFile(sorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@ -1002,6 +992,72 @@ public class Dictionary {
    }
  }
  /**
   * Gathers the flag sets of consecutive identical entries and adds each finished group to the
   * {@link #words} FST compiler. Entries must arrive in non-decreasing order. When a group
   * contains at least one form without the hidden flag, its hidden forms are left out.
   */
  private class EntryGrouper {
    final FSTCompiler<IntsRef> words =
        new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
    private final List<char[]> group = new ArrayList<>();
    private final List<Integer> stemExceptionIDs = new ArrayList<>();
    private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
    private final IntsRefBuilder scratchInts = new IntsRefBuilder();
    private String currentEntry = null;

    /**
     * Registers one form of {@code entry}, flushing the previously collected group first if
     * {@code entry} differs from it.
     *
     * @throws IllegalArgumentException if {@code entry} sorts before the previous entry
     */
    void add(String entry, char[] flags, int stemExceptionID) throws IOException {
      final boolean startsNewGroup = !entry.equals(currentEntry);
      if (startsNewGroup) {
        final String previous = currentEntry;
        if (previous != null) {
          if (entry.compareTo(previous) < 0) {
            throw new IllegalArgumentException("out of order: " + entry + " < " + previous);
          }
          flushGroup();
        }
        currentEntry = entry;
      }

      group.add(flags);
      if (hasStemExceptions) {
        stemExceptionIDs.add(stemExceptionID);
      }
    }

    /** Adds the collected forms of the current entry to {@link #words} and empties the group. */
    void flushGroup() throws IOException {
      final boolean dropHidden = hasVisibleForm();
      final IntsRefBuilder ords = new IntsRefBuilder();

      for (int pos = 0, size = group.size(); pos < size; pos++) {
        final char[] flags = group.get(pos);
        final boolean keep = !dropHidden || !hasHiddenFlag(flags);
        if (keep) {
          encodeFlags(flagsScratch, flags);
          final int added = flagLookup.add(flagsScratch.get());
          // a negative result means the flag set already exists in the hash
          final int ord = added >= 0 ? added : -added - 1;
          ords.append(ord);
          if (hasStemExceptions) {
            ords.append(stemExceptionIDs.get(pos));
          }
        }
      }

      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), ords.get());

      group.clear();
      stemExceptionIDs.clear();
    }

    /** Tells whether the current group holds at least one form without the hidden flag. */
    private boolean hasVisibleForm() {
      for (char[] flags : group) {
        if (!hasHiddenFlag(flags)) {
          return true;
        }
      }
      return false;
    }
  }
  /**
   * Checks {@code flags} for {@code HIDDEN_FLAG} (called 'ONLYUPCASEFLAG' in Hunspell) by
   * delegating to {@code hasFlag}.
   */
  static boolean hasHiddenFlag(char[] flags) {
    return hasFlag(flags, HIDDEN_FLAG);
  }
  static char[] decodeFlags(BytesRef b) {
    if (b.length == 0) {
      return CharsRef.EMPTY_CHARS;
@ -1191,9 +1247,13 @@ public class Dictionary {
  }
  /**
   * Cleans all of {@code input}: the same as the prefix-limited overload called with the full
   * length of {@code input}.
   */
  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
    final int wholeLength = input.length();
    return cleanInput(input, wholeLength, reuse);
  }
  private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
    reuse.setLength(0);
-    for (int i = 0; i < input.length(); i++) {
+    for (int i = 0; i < prefixLength; i++) {
      char ch = input.charAt(i);
      if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -95,57 +95,30 @@ final class Stemmer {
      word = scratchBuffer;
    }
-    int caseType = caseOf(word, length);
+    WordCase wordCase = caseOf(word, length);
-    if (caseType == UPPER_CASE) {
+    List<CharsRef> list = doStem(word, length, false);
-      // upper: union exact, title, lower
+    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      caseFoldLower(titleBuffer, length);
      List<CharsRef> list = doStem(word, length, false);
      list.addAll(doStem(titleBuffer, length, true));
      list.addAll(doStem(lowerBuffer, length, true));
      return list;
    } else if (caseType == TITLE_CASE) {
      // title: union exact, lower
      caseFoldLower(word, length);
      List<CharsRef> list = doStem(word, length, false);
      list.addAll(doStem(lowerBuffer, length, true));
      return list;
    } else {
      // exact match only
      return doStem(word, length, false);
    }
    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
      caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
      list.addAll(doStem(lowerBuffer, length, true));
    }
    return list;
  }
  // temporary buffers for case variants
  private char[] lowerBuffer = new char[8];
  private char[] titleBuffer = new char[8];
  private static final int EXACT_CASE = 0;
  private static final int TITLE_CASE = 1;
  private static final int UPPER_CASE = 2;
  /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private int caseOf(char[] word, int length) {
+  private WordCase caseOf(char[] word, int length) {
    if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
-      return EXACT_CASE;
+      return WordCase.MIXED;
    }
-    // determine if we are title or lowercase (or something funky, in which it's exact)
+    return WordCase.caseOf(word, length);
    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
      boolean v = Character.isUpperCase(word[i]);
      seenUpper |= v;
      seenLower |= !v;
    }
    if (!seenLower) {
      return UPPER_CASE;
    } else if (!seenUpper) {
      return TITLE_CASE;
    } else {
      return EXACT_CASE;
    }
  }
  /** folds titlecase variant of word to titleBuffer */
@ -169,26 +142,21 @@ final class Stemmer {
    IntsRef forms = dictionary.lookupWord(word, 0, length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
        boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
        boolean checkNeedAffix = dictionary.needaffix != -1;
        boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
        if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
        dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
        char[] wordFlags = Dictionary.decodeFlags(scratch);
-          // we are looking for a case variant, but this word does not allow it
+        if (!acceptCase(caseVariant, wordFlags)) {
          if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
          continue;
        }
        // we can't add this form, it's a pseudostem requiring an affix
-          if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
+        if (dictionary.needaffix != -1
            && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
          continue;
        }
        // we can't add this form, it only belongs inside a compound word
-          if (checkOnlyInCompound
+        if (dictionary.onlyincompound != -1
            && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
          continue;
        }
        }
        stems.add(newStem(word, length, forms, i));
      }
    }
@ -200,6 +168,12 @@ final class Stemmer {
    return stems;
  }
  /**
   * Decides whether a dictionary form with the given flags may be used for the current lookup.
   *
   * <p>An exact-case lookup rejects forms carrying the hidden flag. A case-variant lookup rejects
   * forms carrying the dictionary's keepcase flag, when such a flag is defined.
   */
  private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
    if (!caseVariant) {
      return !Dictionary.hasHiddenFlag(wordFlags);
    }
    if (dictionary.keepcase == -1) {
      // no keepcase flag defined, so nothing can forbid the case variant
      return true;
    }
    return !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase);
  }
  /**
   * Find the unique stem(s) of the provided word
   *
@ -595,9 +569,7 @@ final class Stemmer {
          }
          // we are looking for a case variant, but this word does not allow it
-          if (caseVariant
+          if (!acceptCase(caseVariant, wordFlags)) {
              && dictionary.keepcase != -1
              && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
            continue;
          }
          // we aren't decompounding (yet)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@ -0,0 +1,61 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 /**
  * The letter-case pattern of a word, judged from its first character and from the upper- and
  * lowercase letters among the remaining ones. Characters that are neither uppercase nor
  * lowercase (digits, punctuation) do not influence the result.
  */
 enum WordCase {
  /** The first character is uppercase and no lowercase letter follows. */
  UPPER,
  /** The first character is uppercase and no uppercase letter follows. */
  TITLE,
  /** The first character is not uppercase and no uppercase letter follows. */
  LOWER,
  /** Any other combination of upper- and lowercase letters. */
  MIXED;

  /**
   * Classifies the first {@code length} characters of {@code word}.
   *
   * @param word the buffer holding the word
   * @param length the number of leading characters to inspect
   * @return the case pattern; {@link #LOWER} when {@code length} is zero
   */
  static WordCase caseOf(char[] word, int length) {
    // only look at word[0] when it is part of the requested range
    boolean capitalized = length > 0 && Character.isUpperCase(word[0]);

    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
      char ch = word[i];
      seenUpper = seenUpper || Character.isUpperCase(ch);
      seenLower = seenLower || Character.isLowerCase(ch);
      if (seenUpper && seenLower) {
        break; // the outcome can no longer change
      }
    }

    return get(capitalized, seenUpper, seenLower);
  }

  /**
   * Classifies the first {@code length} characters of {@code word}.
   *
   * @param word the character sequence holding the word
   * @param length the number of leading characters to inspect
   * @return the case pattern; {@link #LOWER} when {@code length} is zero
   */
  static WordCase caseOf(CharSequence word, int length) {
    // only look at the first character when it is part of the requested range
    boolean capitalized = length > 0 && Character.isUpperCase(word.charAt(0));

    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
      char ch = word.charAt(i);
      seenUpper = seenUpper || Character.isUpperCase(ch);
      seenLower = seenLower || Character.isLowerCase(ch);
      if (seenUpper && seenLower) {
        break; // the outcome can no longer change
      }
    }

    return get(capitalized, seenUpper, seenLower);
  }

  /** Maps the three observations about a word onto a case pattern. */
  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
    if (capitalized) {
      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
    }
    return seenUpper ? MIXED : LOWER;
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
@ -0,0 +1,42 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import org.junit.BeforeClass;
 /**
  * Stemming of case variants against a dictionary that holds the mixed-case entry
  * {@code OpenOffice.org} and the all-caps entry {@code UNICEF} with an {@code 's} suffix flag.
  */
 public class TestAllCaps extends StemmerTestBase {
  @BeforeClass
  public static void beforeClass() throws Exception {
    init("allcaps.aff", "allcaps.dic");
  }

  public void testGood() {
    // the dictionary spelling itself, with and without the suffix
    assertStemsTo("OpenOffice.org", "OpenOffice.org");
    assertStemsTo("UNICEF's", "UNICEF");

    // Hunspell returns these title-cased stems, so for consistency we do, too
    assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
    assertStemsTo("UNICEF'S", "Unicef");
  }

  public void testWrong() {
    // title-case spellings are not in the dictionary, so they must yield no stems
    for (String misspelled : new String[] {"Openoffice.org", "Unicef", "Unicef's"}) {
      assertStemsTo(misspelled);
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
  public void testStemming() {
    assertStemsTo("works", "work");
    assertStemsTo("work", "work");
-    assertStemsTo("R2/D2", "R2/D2");
+    assertStemsTo("R2/D2", "R2/D2", "R2/d2");
    assertStemsTo("R2/D2s", "R2/D2");
    assertStemsTo("N/A", "N/A");
    assertStemsTo("N/As");
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff
@ -0,0 +1,5 @@
 # check uppercase forms of allcaps word + affix and words with mixed casing
 WORDCHARS '.
 SFX S N 1
 SFX S   0     's      .
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic
@ -0,0 +1,3 @@
 2
 OpenOffice.org
 UNICEF/S
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff
@ -1,4 +1,5 @@
 SET UTF-8
 WORDCHARS \/0123456789
 SFX A Y 1
 SFX A 0 s . +PLUR