LUCENE-9702: Hunspell: support alternate casing for short language codes (#2253)

2021-01-29 11:46:45 +01:00 · 2021-01-29 11:46:45 +01:00 · ff943ece8f
parent 6635d7a5e7
commit ff943ece8f
6 changed files with 28 additions and 1 deletion
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -374,7 +374,9 @@ public class Dictionary {
        fullStrip = true;
      } else if ("LANG".equals(firstWord)) {
        language = singleArgument(reader, line);
-        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
+        int underscore = language.indexOf("_");
+        String langCode = underscore < 0 ? language : language.substring(0, underscore);
+        alternateCasing = langCode.equals("tr") || langCode.equals("az");
      } else if ("BREAK".equals(firstWord)) {
        breaks = parseBreaks(reader, line);
      } else if ("FORBIDDENWORD".equals(firstWord)) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -56,6 +56,11 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("i53643");
  }

+  @Test
+  public void dotless_i() throws Exception {
+    doTest("dotless_i");
+  }
+
  @Test
  public void needAffixOnAffixes() throws Exception {
    doTest("needaffix5");
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.aff
@@ -0,0 +1,2 @@
+SET UTF-8
+LANG tr
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.dic
@@ -0,0 +1,4 @@
+3
+iç
+ışık
+Diyarbakır
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.good
@@ -0,0 +1,8 @@
+Diyarbakır
+DİYARBAKIR
+iç
+İç
+ışık
+Işık
+İÇ
+IŞIK
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dotless_i.wrong
@@ -0,0 +1,6 @@
+Diyarbakir
+DIYARBAKIR
+Iç
+İşık
+IÇ
+İŞIK