LUCENE-9701: Hunspell: implement simple REP-based suggestion algorithm (#2251)

Peter Gromov 2021-02-01 10:23:54 +01:00 committed by GitHub
parent 9d45dfe776
commit 8a34cc7afd
11 changed files with 276 additions and 3 deletions


@@ -56,6 +56,7 @@ configure(project(":lucene:analysis:common")) {
"**/*.dic",
"**/*.wrong",
"**/*.good",
"**/*.sug",
"**/charfilter/*.htm*",
"**/*LuceneResourcesWikiPage.html"
]


@@ -152,6 +152,9 @@ public class Dictionary {
// ignored characters (dictionary, affix, inputs)
private char[] ignore;
String tryChars = "";
List<RepEntry> repTable = new ArrayList<>();
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
FST<CharsRef> oconv;
@@ -383,6 +386,14 @@ public class Dictionary {
alternateCasing = langCode.equals("tr") || langCode.equals("az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("TRY".equals(firstWord)) {
tryChars = singleArgument(reader, line);
} else if ("REP".equals(firstWord)) {
int count = Integer.parseInt(singleArgument(reader, line));
for (int i = 0; i < count; i++) {
String[] parts = splitBySpace(reader, reader.readLine(), 3);
repTable.add(new RepEntry(parts[1], parts[2]));
}
} else if ("FORBIDDENWORD".equals(firstWord)) {
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIN".equals(firstWord)) {


@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Arrays;
import java.util.LinkedHashSet;
class ModifyingSuggester {
private final LinkedHashSet<String> result = new LinkedHashSet<>();
private final char[] tryChars;
private final SpellChecker speller;
ModifyingSuggester(SpellChecker speller) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
}
LinkedHashSet<String> suggest(String word) {
tryRep(word);
tryAddingChar(word);
return result;
}
private void tryRep(String word) {
for (RepEntry entry : speller.dictionary.repTable) {
for (String candidate : entry.substitute(word)) {
if (trySuggestion(candidate)) {
continue;
}
if (candidate.contains(" ")
&& Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
result.add(candidate);
}
}
}
}
private void tryAddingChar(String word) {
for (int i = 0; i <= word.length(); i++) {
String prefix = word.substring(0, i);
String suffix = word.substring(i);
for (char toInsert : tryChars) {
trySuggestion(prefix + toInsert + suffix);
}
}
}
private boolean trySuggestion(String candidate) {
if (speller.checkWord(candidate)) {
result.add(candidate);
return true;
}
return false;
}
}
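
A minimal standalone sketch of the candidate enumeration in tryAddingChar, using a hypothetical misspelling and TRY set (in the real class only candidates that pass checkWord survive):

// With TRY "e" and the misspelling "hllo", one insertion is tried per position:
String word = "hllo";
for (int i = 0; i <= word.length(); i++) {
  for (char toInsert : "e".toCharArray()) {
    System.out.println(word.substring(0, i) + toInsert + word.substring(i));
  }
}
// prints ehllo, hello, hlelo, hlleo, hlloe; only "hello" would be kept if the dictionary contains it

tryRep is analogous: each substitution produced by a RepEntry is either accepted directly, or, when it contains a space (e.g. "a_lot" becoming "a lot"), accepted if every space-separated part spells correctly.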


@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
class RepEntry {
private final String pattern;
private final String replacement;
private final boolean mustStart;
private final boolean mustEnd;
private final int patternLen;
RepEntry(String rawPattern, String rawReplacement) {
mustStart = rawPattern.startsWith("^");
mustEnd = rawPattern.endsWith("$");
pattern = rawPattern.substring(mustStart ? 1 : 0, rawPattern.length() - (mustEnd ? 1 : 0));
replacement = rawReplacement.replace('_', ' ');
patternLen = pattern.length();
}
List<String> substitute(String word) {
if (mustStart) {
boolean matches = mustEnd ? word.equals(pattern) : word.startsWith(pattern);
return matches
? Collections.singletonList(replacement + word.substring(patternLen))
: Collections.emptyList();
}
if (mustEnd) {
return word.endsWith(pattern)
? Collections.singletonList(word.substring(0, word.length() - patternLen) + replacement)
: Collections.emptyList();
}
int pos = word.indexOf(pattern);
if (pos < 0) return Collections.emptyList();
List<String> result = new ArrayList<>();
while (pos >= 0) {
result.add(word.substring(0, pos) + replacement + word.substring(pos + patternLen));
pos = word.indexOf(pattern, pos + 1);
}
return result;
}
@Override
public String toString() {
return (mustStart ? "^" : "") + pattern + (mustEnd ? "$" : "") + "->" + replacement;
}
}
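
A few illustrative calls, traced from substitute() above (the first two mirror REP entries from the test files added below; "phantomgraph" is just a made-up input showing that every occurrence is replaced, one at a time):

new RepEntry("shun$", "tion").substitute("vacashun");  // [vacation]   (pattern anchored at the end)
new RepEntry("^alot$", "a_lot").substitute("alot");    // [a lot]      ('_' in the replacement becomes a space)
new RepEntry("ph", "f").substitute("phantomgraph");    // [fantomgraph, phantomgraf]
new RepEntry("ph", "f").substitute("form");            // []           (no occurrence, no candidates)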


@@ -23,7 +23,9 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@@ -34,9 +36,9 @@ import org.apache.lucene.util.IntsRef;
* threads). Not all Hunspell features are supported yet.
*/
public class SpellChecker {
- private final Dictionary dictionary;
+ final Dictionary dictionary;
+ final Stemmer stemmer;
private final BytesRef scratch = new BytesRef();
- private final Stemmer stemmer;
public SpellChecker(Dictionary dictionary) {
this.dictionary = dictionary;
@@ -128,6 +130,10 @@ public class SpellChecker {
return false;
}
boolean checkWord(String word) {
return checkWord(word.toCharArray(), word.length(), null);
}
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
@@ -329,4 +335,44 @@ public class SpellChecker {
&& spell(word.substring(0, breakPos))
&& spell(word.substring(breakPos + breakStr.length()));
}
public List<String> suggest(String word) {
if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
ModifyingSuggester modifier = new ModifyingSuggester(this);
Set<String> result = modifier.suggest(word);
if (word.contains("-") && result.stream().noneMatch(s -> s.contains("-"))) {
result.addAll(modifyChunksBetweenDashes(word));
}
return new ArrayList<>(result);
}
private List<String> modifyChunksBetweenDashes(String word) {
List<String> result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}
if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!spell(chunk)) {
for (String chunkSug : suggest(chunk)) {
result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
}
}
}
chunkStart = chunkEnd + 1;
}
return result;
}
}
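
A minimal end-to-end usage sketch (assuming the existing Dictionary constructor that takes a temporary Directory plus affix and dictionary streams; imports and exception handling elided). The expected suggestions match the rep test data added below:

try (InputStream aff = Files.newInputStream(Path.of("rep.aff"));
     InputStream dic = Files.newInputStream(Path.of("rep.dic"))) {
  Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "suggest", aff, dic);
  SpellChecker speller = new SpellChecker(dictionary);
  speller.suggest("phorm");    // [form]       via "REP ph f"
  speller.suggest("vacashun"); // [vacation]   via "REP shun$ tion"
}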


@@ -20,7 +20,9 @@ import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;
import org.junit.Test;
@@ -46,6 +48,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
public void rep() throws Exception {
doTest("rep");
}
@Test
public void forceUCase() throws Exception {
doTest("forceucase");
@@ -178,10 +184,22 @@ public class SpellCheckerTest extends StemmerTestBase {
}
URL wrong = StemmerTestBase.class.getResource(name + ".wrong");
URL sug = StemmerTestBase.class.getResource(name + ".sug");
if (wrong != null) {
- for (String word : Files.readAllLines(Path.of(wrong.toURI()))) {
+ List<String> wrongWords = Files.readAllLines(Path.of(wrong.toURI()));
+ for (String word : wrongWords) {
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
}
if (sug != null) {
String suggestions =
wrongWords.stream()
.map(s -> String.join(", ", speller.suggest(s)))
.filter(s -> !s.isEmpty())
.collect(Collectors.joining("\n"));
assertEquals(Files.readString(Path.of(sug.toURI())).trim(), suggestions);
}
} else {
assertNull(".sug file without .wrong file!", sug);
}
}
}


@@ -0,0 +1,5 @@
scott
scot-free
foo-bar
foo-foo-bar
foo-foo-foo


@@ -0,0 +1,21 @@
# With REP suggestions, we can fix typical language-specific misspellings.
# switch off ngram suggestion for testing
MAXNGRAMSUGS 0
REP 8
REP f ph
REP ph f
REP shun$ tion
REP ^alot$ a_lot # give the "a lot" suggestion the highest priority for "alot"
REP ^foo$ bar
REP ' _ # "un'alunno" -> "un alunno"
REP ^vinteún$ vinte_e_un
REP s 's
SFX A Y 1
SFX A 0 's .
WORDCHARS '


@@ -0,0 +1,15 @@
10
form
phantom
vacation
vacations
a
lot
un
alunno
bar
barbars
vinte
e
un
auto/A


@@ -0,0 +1,8 @@
form
phantom
vacation
a lot, lot
un alunno
bar
vinte e un
auto's, auto


@@ -0,0 +1,11 @@
phorm
fantom
vacashun
vacashuns
alot
un'alunno
foo
foobars
barfoos
vinteún
autos