Hunspell: reduce suggestion set dependency on the hash table order (#12239)

* Hunspell: reduce suggestion set dependency on the hash table order. When adding words to a dictionary, suggestions for other words shouldn't change unless they're directly related to the added words. But before, GeneratingSuggester selected 100 best first matches from the hash table, whose order can change significantly after adding any unrelated word. That resulted in unexpected suggestion changes on seemingly unrelated dictionary edits.
2023-04-23 16:51:17 +02:00 · 2023-04-23 16:51:17 +02:00 · 025dfec2dd
parent 2e7426961b
commit 025dfec2dd
4 changed files with 51 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -104,6 +104,8 @@ Other

 * GITHUB#11960: Hunspell: supported empty dictionaries (Peter Gromov)

+* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
+
 ======================== Lucene 9.6.0 =======================

 API Changes
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -98,7 +98,7 @@ class GeneratingSuggester {
          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

          boolean overflow = roots.size() == MAX_ROOTS;
-          if (overflow && sc <= roots.peek().score) {
+          if (overflow && isWorseThan(sc, rootChars, roots.peek())) {
            return;
          }

@ -119,6 +119,11 @@ class GeneratingSuggester {
    return roots.stream().sorted().collect(Collectors.toList());
  }

+  private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) {
+    return score < root.score
+        || score == root.score && CharSequence.compare(candidate, root.word.word) > 0;
+  }
+
  private void processSuggestibleWords(
      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
    if (entryCache != null) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java
@ -65,13 +65,15 @@ public abstract class StemmerTestBase extends LuceneTestCase {
      }
    }

+    return loadDictionary(ignoreCase, affixStream, dictStreams);
+  }
+
+  protected static Dictionary loadDictionary(
+      boolean ignoreCase, InputStream affixStream, InputStream... dictStreams)
+      throws IOException, ParseException {
    try {
-      return new Dictionary(
-          new ByteBuffersDirectory(),
-          "dictionary",
-          affixStream,
-          Arrays.asList(dictStreams),
-          ignoreCase);
+      ByteBuffersDirectory dir = new ByteBuffersDirectory();
+      return new Dictionary(dir, "dictionary", affixStream, Arrays.asList(dictStreams), ignoreCase);
    } finally {
      IOUtils.closeWhileHandlingException(affixStream);
      IOUtils.closeWhileHandlingException(dictStreams);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@ -21,8 +21,11 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;

+import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
@ -235,4 +238,36 @@ public class TestHunspell extends LuceneTestCase {
  private void checkCompression(Hunspell h, String expected, String... words) {
    assertEquals(expected, h.compress(List.of(words)).internalsToString());
  }
+
+  @Test
+  public void testSuggestionOrderStabilityOnDictionaryEditing() throws IOException, ParseException {
+    String original = "some_word";
+
+    List<String> words = new ArrayList<>();
+    for (char c = 0; c < 65535; c++) {
+      if (Character.isLetter(c)) {
+        words.add(original + c);
+      }
+    }
+
+    String smallDict = "1\n" + String.join("\n", words.subList(0, words.size() / 4));
+    String largerDict = "1\n" + String.join("\n", words);
+    Dictionary small =
+        loadDictionary(
+            false,
+            new ByteArrayInputStream(new byte[0]),
+            new ByteArrayInputStream(smallDict.getBytes(StandardCharsets.UTF_8)));
+    Dictionary larger =
+        loadDictionary(
+            false,
+            new ByteArrayInputStream(new byte[0]),
+            new ByteArrayInputStream(largerDict.getBytes(StandardCharsets.UTF_8)));
+
+    assertFalse(new Hunspell(small).spell(original));
+
+    List<String> smallSug = new Hunspell(small).suggest(original);
+    List<String> largerSug = new Hunspell(larger).suggest(original);
+    assertEquals(smallSug.toString(), 4, smallSug.size());
+    assertEquals(smallSug, largerSug);
+  }
 }