mirror of https://github.com/apache/lucene.git
LUCENE-9766: Hunspell: add API for retrieving dictionary morphologica… (#2363)
This commit is contained in:
parent
ee447d1516
commit
f1a1165ac8
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * An object representing homonym dictionary entries. Note that the order of entries here may differ
 * from the order in the *.dic file!
 *
 * @see Dictionary#lookupEntries
 */
public interface DictEntries {
  /**
   * Returns how many dictionary entries share this word form.
   *
   * @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
   *     there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
   *     be passed into other methods of this class.
   */
  int size();

  /**
   * Returns the full morphological data string associated with one homonym entry.
   *
   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with the homonym at the given entry index, or an empty string
   */
  String getMorphologicalData(int entryIndex);

  /**
   * Returns the values of the morphological fields with the given key for one homonym entry.
   *
   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
   *     associated with the homonym at the given entry index
   */
  List<String> getMorphologicalValues(int entryIndex, String key);
}
|
|
@ -44,7 +44,6 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -83,7 +82,6 @@ public class Dictionary {
|
|||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
||||
private static final String PREFIX_CONDITION_REGEX = "%s.*";
|
||||
private static final String SUFFIX_CONDITION_REGEX = ".*%s";
|
||||
private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
|
||||
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
||||
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
|
||||
|
||||
|
@ -136,15 +134,13 @@ public class Dictionary {
|
|||
private String[] morphAliases;
|
||||
private int morphAliasCount = 0;
|
||||
|
||||
// st: morphological entries (either directly, or aliased from AM)
|
||||
private String[] stemExceptions = new String[8];
|
||||
private int stemExceptionCount = 0;
|
||||
final List<String> morphData = new ArrayList<>(Collections.singletonList("")); // empty data at 0
|
||||
|
||||
/**
|
||||
* we set this during sorting, so we know to add an extra FST output. when set, some words have
|
||||
* exceptional stems, and the last entry is a pointer to stemExceptions
|
||||
* we set this during sorting, so we know to add an extra int (index in {@link #morphData}) to FST
|
||||
* output
|
||||
*/
|
||||
boolean hasStemExceptions;
|
||||
boolean hasCustomMorphData;
|
||||
|
||||
boolean ignoreCase;
|
||||
boolean checkSharpS;
|
||||
|
@ -274,7 +270,7 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
int formStep() {
|
||||
return hasStemExceptions ? 2 : 1;
|
||||
return hasCustomMorphData ? 2 : 1;
|
||||
}
|
||||
|
||||
/** Looks up Hunspell word forms from the dictionary */
|
||||
|
@ -543,6 +539,44 @@ public class Dictionary {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Looks up all dictionary entries for the given root word.
 *
 * @param root a string to look up in the dictionary. No case conversion or affix removal is
 *     performed. To get the possible roots of any word, you may call {@link
 *     Hunspell#getRoots(String)}
 * @return the dictionary entries for the given root, or {@code null} if there's none
 */
public DictEntries lookupEntries(String root) {
  IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
  if (forms == null) return null;

  // Layout of "forms": one int per entry (the form id) when there's no custom morphological
  // data, or two ints per entry (form id, then index into morphData) when there is — see the
  // stride used in size() and the "* 2 + 1" offset below.
  return new DictEntries() {
    @Override
    public int size() {
      return forms.length / (hasCustomMorphData ? 2 : 1);
    }

    @Override
    public String getMorphologicalData(int entryIndex) {
      // Without custom morph data there's nothing stored per entry; index 0 of morphData is "".
      if (!hasCustomMorphData) return "";
      return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
    }

    @Override
    public List<String> getMorphologicalValues(int entryIndex, String key) {
      // Keys are expected in "kk:" form: two letters plus a colon.
      assert key.length() == 3;
      assert key.charAt(2) == ':';

      // Cheap pre-check with contains() avoids splitting when the key is absent.
      String fields = getMorphologicalData(entryIndex);
      if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();

      return Arrays.stream(fields.split(" "))
          .filter(s -> s.startsWith(key))
          .map(s -> s.substring(3))
          .collect(Collectors.toList());
    }
  };
}
|
||||
|
||||
static String extractLanguageCode(String isoCode) {
|
||||
int underscore = isoCode.indexOf("_");
|
||||
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
|
||||
|
@ -1024,11 +1058,13 @@ public class Dictionary {
|
|||
continue;
|
||||
}
|
||||
line = unescapeEntry(line);
|
||||
// if we havent seen any stem exceptions, try to parse one
|
||||
if (!hasStemExceptions) {
|
||||
// if we haven't seen any custom morphological data, try to parse one
|
||||
if (!hasCustomMorphData) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0 && morphStart < line.length()) {
|
||||
hasStemExceptions = hasStemException(line.substring(morphStart + 1));
|
||||
String data = line.substring(morphStart + 1);
|
||||
hasCustomMorphData =
|
||||
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1156,6 +1192,8 @@ public class Dictionary {
|
|||
Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
|
||||
boolean success = false;
|
||||
|
||||
Map<String, Integer> morphIndices = new HashMap<>();
|
||||
|
||||
EntryGrouper grouper = new EntryGrouper(flags);
|
||||
|
||||
try (ByteSequencesReader reader =
|
||||
|
@ -1195,20 +1233,17 @@ public class Dictionary {
|
|||
}
|
||||
entry = line.substring(0, flagSep);
|
||||
}
|
||||
// we possibly have morphological data
|
||||
int stemExceptionID = 0;
|
||||
|
||||
int morphDataID = 0;
|
||||
if (end + 1 < line.length()) {
|
||||
String morphData = line.substring(end + 1);
|
||||
for (String datum : splitMorphData(morphData)) {
|
||||
if (datum.startsWith("st:")) {
|
||||
stemExceptionID = addStemException(datum.substring(3));
|
||||
} else if (datum.startsWith("ph:") && datum.length() > 3) {
|
||||
addPhoneticRepEntries(entry, datum.substring(3));
|
||||
}
|
||||
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||
if (!morphFields.isEmpty()) {
|
||||
morphFields.sort(Comparator.naturalOrder());
|
||||
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||
}
|
||||
}
|
||||
|
||||
grouper.add(entry, wordForm, stemExceptionID);
|
||||
grouper.add(entry, wordForm, morphDataID);
|
||||
}
|
||||
|
||||
// finalize last entry
|
||||
|
@ -1224,10 +1259,29 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
private int addStemException(String stemException) {
|
||||
stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
|
||||
stemExceptions[stemExceptionCount++] = stemException;
|
||||
return stemExceptionCount; // we use '0' to indicate no exception for the form
|
||||
/**
 * Splits the raw morphological data of a *.dic line into individual {@code kk:vvvvvv} fields.
 *
 * <p>Side effect: {@code ph:} fields are not returned but are registered as phonetic REP entries
 * for the given word via {@link #addPhoneticRepEntries}.
 *
 * @param word the dictionary word the data belongs to
 * @param unparsed the raw text after the morph separator on the *.dic line
 * @return all non-{@code ph:} morphological fields, or an empty list if there are none
 */
private List<String> readMorphFields(String word, String unparsed) {
  // Lazily allocated: most entries have no custom fields, so avoid the list in the common case.
  List<String> morphFields = null;
  for (String datum : splitMorphData(unparsed)) {
    if (datum.startsWith("ph:")) {
      addPhoneticRepEntries(word, datum.substring(3));
    } else {
      if (morphFields == null) morphFields = new ArrayList<>(1);
      morphFields.add(datum);
    }
  }
  return morphFields == null ? Collections.emptyList() : morphFields;
}
|
||||
|
||||
/**
 * Interns a morphological-data string in {@link #morphData}, reusing the index of an identical
 * string seen before (many entries share the same data).
 *
 * @param indices cache mapping already-stored strings to their {@code morphData} index
 * @param morphFields the joined, sorted morphological fields of one entry
 * @return the index of the string in {@code morphData}; never 0, since index 0 holds the empty
 *     string placeholder set up at field initialization
 */
private int addMorphFields(Map<String, Integer> indices, String morphFields) {
  Integer alreadyCached = indices.get(morphFields);
  if (alreadyCached != null) {
    return alreadyCached;
  }

  int index = morphData.size();
  indices.put(morphFields, index);
  morphData.add(morphFields);
  return index;
}
|
||||
|
||||
private void addPhoneticRepEntries(String word, String ph) {
|
||||
|
@ -1278,7 +1332,7 @@ public class Dictionary {
|
|||
final FSTCompiler<IntsRef> words =
|
||||
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
|
||||
private final List<char[]> group = new ArrayList<>();
|
||||
private final List<Integer> stemExceptionIDs = new ArrayList<>();
|
||||
private final List<Integer> morphDataIDs = new ArrayList<>();
|
||||
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
private String currentEntry = null;
|
||||
private final FlagEnumerator flagEnumerator;
|
||||
|
@ -1287,7 +1341,7 @@ public class Dictionary {
|
|||
this.flagEnumerator = flagEnumerator;
|
||||
}
|
||||
|
||||
void add(String entry, char[] flags, int stemExceptionID) throws IOException {
|
||||
void add(String entry, char[] flags, int morphDataID) throws IOException {
|
||||
if (!entry.equals(currentEntry)) {
|
||||
if (currentEntry != null) {
|
||||
if (entry.compareTo(currentEntry) < 0) {
|
||||
|
@ -1299,8 +1353,8 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
group.add(flags);
|
||||
if (hasStemExceptions) {
|
||||
stemExceptionIDs.add(stemExceptionID);
|
||||
if (hasCustomMorphData) {
|
||||
morphDataIDs.add(morphDataID);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1322,8 +1376,8 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
currentOrds.append(flagEnumerator.add(flags));
|
||||
if (hasStemExceptions) {
|
||||
currentOrds.append(stemExceptionIDs.get(i));
|
||||
if (hasCustomMorphData) {
|
||||
currentOrds.append(morphDataIDs.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1331,7 +1385,7 @@ public class Dictionary {
|
|||
words.add(scratchInts.get(), currentOrds.get());
|
||||
|
||||
group.clear();
|
||||
stemExceptionIDs.clear();
|
||||
morphDataIDs.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1365,10 +1419,6 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
String getStemException(int id) {
|
||||
return stemExceptions[id - 1];
|
||||
}
|
||||
|
||||
private void parseMorphAlias(String line) {
|
||||
if (morphAliases == null) {
|
||||
// first line should be the aliases count
|
||||
|
@ -1380,15 +1430,6 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean hasStemException(String morphData) {
|
||||
for (String datum : splitMorphData(morphData)) {
|
||||
if (datum.startsWith("st:")) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private List<String> splitMorphData(String morphData) {
|
||||
// first see if it's an alias
|
||||
if (morphAliasCount > 0) {
|
||||
|
@ -1401,9 +1442,13 @@ public class Dictionary {
|
|||
if (morphData.isBlank()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isBlank())
|
||||
return Arrays.stream(morphData.split("\\s+"))
|
||||
.filter(
|
||||
s ->
|
||||
s.length() > 3
|
||||
&& Character.isLetter(s.charAt(0))
|
||||
&& Character.isLetter(s.charAt(1))
|
||||
&& s.charAt(2) == ':')
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -156,7 +156,7 @@ public class Hunspell {
|
|||
length,
|
||||
originalCase,
|
||||
context,
|
||||
(stem, formID, stemException) -> {
|
||||
(stem, formID, morphDataId) -> {
|
||||
if (acceptsStem(formID)) {
|
||||
result[0] = new Root<>(stem, formID);
|
||||
}
|
||||
|
@ -253,6 +253,24 @@ public class Hunspell {
|
|||
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
||||
}
|
||||
|
||||
/**
 * Find all roots that could result in the given word after case conversion and adding affixes.
 * This corresponds to the original {@code hunspell -s} (stemming) functionality.
 *
 * <p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are
 * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic
 * file (but differ from some existing entries in case). No roots are returned for compound words.
 *
 * <p>The returned roots may be used to retrieve morphological data via {@link
 * Dictionary#lookupEntries}.
 */
public List<String> getRoots(String word) {
  return stemmer.stem(word).stream()
      .map(CharsRef::toString)
      // the stemmer may yield the same root text more than once; deduplicate after conversion
      .distinct()
      .collect(Collectors.toList());
}
|
||||
|
||||
private class CompoundPart {
|
||||
final CompoundPart prev;
|
||||
final int index, length;
|
||||
|
|
|
@ -343,23 +343,28 @@ final class Stemmer {
|
|||
* @param stem the text of the found dictionary entry
|
||||
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
|
||||
* Dictionary#hasFlag(int, char)}
|
||||
* @param stemException "st:" morphological data if present, {@code null} otherwise
|
||||
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
|
||||
* {@link Dictionary#morphData}
|
||||
* @return whether the processing should be continued
|
||||
*/
|
||||
boolean processRoot(CharsRef stem, int formID, String stemException);
|
||||
boolean processRoot(CharsRef stem, int formID, int morphDataId);
|
||||
}
|
||||
|
||||
private String stemException(IntsRef forms, int formIndex) {
|
||||
if (dictionary.hasStemExceptions) {
|
||||
int exceptionID = forms.ints[forms.offset + formIndex + 1];
|
||||
if (exceptionID > 0) {
|
||||
return dictionary.getStemException(exceptionID);
|
||||
private String stemException(int morphDataId) {
|
||||
if (morphDataId > 0) {
|
||||
String data = dictionary.morphData.get(morphDataId);
|
||||
int start = data.startsWith("st:") ? 0 : data.indexOf(" st:");
|
||||
if (start >= 0) {
|
||||
int nextSpace = data.indexOf(' ', start + 3);
|
||||
return data.substring(start + 3, nextSpace < 0 ? data.length() : nextSpace);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private CharsRef newStem(CharsRef stem, String exception) {
|
||||
private CharsRef newStem(CharsRef stem, int morphDataId) {
|
||||
String exception = stemException(morphDataId);
|
||||
|
||||
if (dictionary.needsOutputCleaning) {
|
||||
scratchSegment.setLength(0);
|
||||
if (exception != null) {
|
||||
|
@ -759,7 +764,8 @@ final class Stemmer {
|
|||
private boolean callProcessor(
|
||||
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
||||
CharsRef stem = new CharsRef(word, offset, length);
|
||||
return processor.processRoot(stem, forms.ints[forms.offset + i], stemException(forms, i));
|
||||
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
|
||||
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
|
||||
}
|
||||
|
||||
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
|
||||
|
|
|
@ -43,6 +43,11 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
|||
|
||||
static void init(boolean ignoreCase, String affix, String... dictionaries)
|
||||
throws IOException, ParseException {
|
||||
stemmer = new Stemmer(loadDictionary(ignoreCase, affix, dictionaries));
|
||||
}
|
||||
|
||||
static Dictionary loadDictionary(boolean ignoreCase, String affix, String... dictionaries)
|
||||
throws IOException, ParseException {
|
||||
if (dictionaries.length == 0) {
|
||||
throw new IllegalArgumentException("there must be at least one dictionary");
|
||||
}
|
||||
|
@ -52,7 +57,7 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
|||
throw new FileNotFoundException("file not found: " + affix);
|
||||
}
|
||||
|
||||
InputStream dictStreams[] = new InputStream[dictionaries.length];
|
||||
InputStream[] dictStreams = new InputStream[dictionaries.length];
|
||||
for (int i = 0; i < dictionaries.length; i++) {
|
||||
dictStreams[i] = StemmerTestBase.class.getResourceAsStream(dictionaries[i]);
|
||||
if (dictStreams[i] == null) {
|
||||
|
@ -61,14 +66,12 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
try {
|
||||
Dictionary dictionary =
|
||||
new Dictionary(
|
||||
return new Dictionary(
|
||||
new ByteBuffersDirectory(),
|
||||
"dictionary",
|
||||
affixStream,
|
||||
Arrays.asList(dictStreams),
|
||||
ignoreCase);
|
||||
stemmer = new Stemmer(dictionary);
|
||||
} finally {
|
||||
IOUtils.closeWhileHandlingException(affixStream);
|
||||
IOUtils.closeWhileHandlingException(dictStreams);
|
||||
|
@ -80,7 +83,7 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
|||
Arrays.sort(expected);
|
||||
|
||||
List<CharsRef> stems = stemmer.stem(s);
|
||||
String actual[] = new String[stems.size()];
|
||||
String[] actual = new String[stems.size()];
|
||||
for (int i = 0; i < actual.length; i++) {
|
||||
actual[i] = stems.get(i).toString();
|
||||
}
|
||||
|
|
|
@ -205,6 +205,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
+ ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
|
||||
+ ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
|
||||
+ ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
|
||||
+ ("morphData=" + RamUsageTester.humanSizeOf(dic.morphData) + ", ")
|
||||
+ ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
|
||||
+ ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
|
||||
}
|
||||
|
|
|
@ -22,6 +22,10 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
@ -244,6 +248,35 @@ public class TestDictionary extends LuceneTestCase {
|
|||
assertEquals(src, new String(strategy.parseFlags(asAscii)));
|
||||
}
|
||||
|
||||
@Test
public void testCustomMorphologicalData() throws IOException, ParseException {
  Dictionary dic = loadDictionary("morphdata.aff", "morphdata.dic");
  // an unknown root yields null rather than an empty DictEntries
  assertNull(dic.lookupEntries("nonexistent"));

  // a single entry with one "fr:" field; querying an absent key gives an empty list
  DictEntries simpleNoun = dic.lookupEntries("simplenoun");
  assertEquals(1, simpleNoun.size());
  assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
  assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));

  // three homonyms of "lay"; entry order may differ from the .dic file, so sort before comparing.
  // Fields within each entry are sorted and space-joined.
  DictEntries lay = dic.lookupEntries("lay");
  String actual =
      IntStream.range(0, 3)
          .mapToObj(lay::getMorphologicalData)
          .sorted()
          .collect(Collectors.joining("; "));
  assertEquals("is:past_2 po:verb st:lie; is:present po:verb; po:noun", actual);

  // repeated keys ("al:" twice) produce multiple values
  DictEntries sing = dic.lookupEntries("sing");
  assertEquals(1, sing.size());
  assertEquals(Arrays.asList("sang", "sung"), sing.getMorphologicalValues(0, "al:"));

  // unrecognized tokens between well-formed "kk:" fields are dropped
  assertEquals(
      "al:abaléar po:verbo ts:transitiva",
      dic.lookupEntries("unsupported1").getMorphologicalData(0));

  // data with no well-formed "kk:" field at all collapses to the empty string
  assertEquals("", dic.lookupEntries("unsupported2").getMorphologicalData(0));
}
|
||||
|
||||
private Directory getDirectory() {
|
||||
return newDirectory();
|
||||
}
|
||||
|
|
|
@ -16,7 +16,11 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
import java.util.Collections;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestStemmer extends StemmerTestBase {
|
||||
|
||||
|
@ -58,6 +62,13 @@ public class TestStemmer extends StemmerTestBase {
|
|||
assertStemsTo("solr", "olr");
|
||||
}
|
||||
|
||||
@Test
public void testHunspellStemmingApi() throws IOException, ParseException {
  // getRoots is the API equivalent of `hunspell -s`: affix removal ("apache" -> "apach")
  // and identity for words that are themselves dictionary roots ("foo")
  Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
  assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
  assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
|
||||
|
||||
// some bogus stuff that should not stem (empty lists)!
|
||||
public void testBogusStems() {
|
||||
assertStemsTo("abs");
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
5
|
||||
11
|
||||
feet/X st:foot
|
||||
work/A st:workverb
|
||||
work/B st:worknoun
|
||||
notspecial
|
||||
simplenoun/A
|
||||
simplenoun/A fr:42
|
||||
sing al:sang al:sung
|
||||
lay po:verb st:lie is:past_2
|
||||
lay po:verb is:present
|
||||
lay po:noun
|
||||
unsupported1 po:verbo ts:transitiva / intransitiva / pronominal VOLG: t i pr al:abaléar
|
||||
unsupported2 [CAT=nc,G=f,N=s]
|
Loading…
Reference in New Issue