hunspell: support empty dictionaries, adapt to the hunspell/C++ repo changes (#11960)

hunspell: support empty dictionaries, adapt to the hunspell/C++ repo changes
2022-11-22 18:23:45 +01:00 · 2022-11-22 18:23:45 +01:00 · 2ae8dd632e
parent 0e0a20d88e
commit 2ae8dd632e
8 changed files with 19 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -60,7 +60,7 @@ Improvements
 Optimizations
 ---------------------

-* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance
+* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)

 Bug Fixes
 ---------------------
@@ -90,6 +90,8 @@ Other

 * GITHUB#977, LUCENE-9500: Remove the deflater hack introduced because of JDK-8252739 (Uwe Schindler)

+* GITHUB#11960: Hunspell: supported empty dictionaries (Peter Gromov)
+
 ======================== Lucene 9.5.0 =======================

 API Changes
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -424,10 +424,13 @@ class WordStorage {
    }

    WordStorage build() throws IOException {
-      assert !group.isEmpty() : "build() should be only called once";
-      flushGroup();
+      if (hashTable.length > 0) {
+        assert !group.isEmpty() : "build() should be only called once";
+        flushGroup();
+      }
      byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
-      return new WordStorage(maxEntryLength, hasCustomMorphData, hashTable, trimmedData);
+      int[] table = hashTable.length == 0 ? new int[1] : hashTable;
+      return new WordStorage(maxEntryLength, hasCustomMorphData, table, trimmedData);
    }
  }

--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellRepositoryTestCases.java
@@ -42,7 +42,6 @@ public class TestHunspellRepositoryTestCases {
          "hu", // Hungarian is hard: a lot of its rules are hardcoded in Hunspell code, not aff/dic
          "morph", // we don't do morphological analysis yet
          "opentaal_keepcase", // Hunspell bug: https://github.com/hunspell/hunspell/issues/712
-          "forbiddenword", // needs https://github.com/hunspell/hunspell/pull/713 PR to be merged
          "nepali", // not supported yet
          "utf8_nonbmp", // code points not supported yet
          "phone" // not supported yet, used only for suggestions in en_ZA
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -37,6 +37,10 @@ import org.apache.lucene.util.IOUtils;

 public class TestSpellChecking extends LuceneTestCase {

+  public void testEmpty() throws Exception {
+    doTest("empty");
+  }
+
  public void testBase() throws Exception {
    doTest("base");
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.aff
@@ -0,0 +1,2 @@
+AF 2000
+INVALID something
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.dic
@@ -0,0 +1 @@
+dummy
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.good
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/empty.wrong
@@ -0,0 +1,3 @@
+everything
+is
+wrong