LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301)

2021-02-05 11:25:32 +01:00 · 2021-02-05 11:25:32 +01:00 · 825d8dbfd9
parent 2f6807cc76
commit 825d8dbfd9
4 changed files with 34 additions and 15 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -832,7 +832,7 @@ public class Dictionary {
      if (ch == '\\' && i + 1 < entry.length()) {
        sb.append(entry.charAt(i + 1));
        i++;
-      } else if (ch == '/') {
+      } else if (ch == '/' && i > 0) {
        sb.append(FLAG_SEPARATOR);
      } else if (!shouldSkipEscapedChar(ch)) {
        sb.append(ch);
@@ -902,10 +902,7 @@ public class Dictionary {
        String line;
        while ((line = lines.readLine()) != null) {
          // wild and unpredictable code comment rules
-          if (line.isEmpty()
+          if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
-              || line.charAt(0) == '/'
-              || line.charAt(0) == '#'
-              || line.charAt(0) == '\t') {
            continue;
          }
          line = unescapeEntry(line);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -17,13 +17,23 @@
 package org.apache.lucene.analysis.hunspell;
 enum WordCase {
+  /** e.g. WORD */
  UPPER,
+  /** e.g. Word */
  TITLE,
+  /** e.g. word */
  LOWER,
-  MIXED;
+
+  /** e.g. WoRd or wOrd */
+  MIXED,
+  /** e.g "-" or "/" or "42" */
+  NEUTRAL;
  static WordCase caseOf(char[] word, int length) {
-    boolean startsWithLower = Character.isLowerCase(word[0]);
+    CharCase startCase = charCase(word[0]);
    boolean seenUpper = false;
    boolean seenLower = false;
@@ -34,7 +44,7 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }
-    return get(startsWithLower, seenUpper, seenLower);
+    return get(startCase, seenUpper, seenLower);
  }
  static WordCase caseOf(CharSequence word) {
@@ -42,7 +52,7 @@ enum WordCase {
  }
  static WordCase caseOf(CharSequence word, int length) {
-    boolean startsWithLower = Character.isLowerCase(word.charAt(0));
+    CharCase startCase = charCase(word.charAt(0));
    boolean seenUpper = false;
    boolean seenLower = false;
@@ -53,14 +63,19 @@ enum WordCase {
      if (seenUpper && seenLower) break;
    }
-    return get(startsWithLower, seenUpper, seenLower);
+    return get(startCase, seenUpper, seenLower);
  }
-  private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
+  private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) {
-    if (!startsWithLower) {
+    if (seenLower && seenUpper) return MIXED;
-      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
+    switch (startCase) {
+      case LOWER:
+        return seenUpper ? MIXED : LOWER;
+      case UPPER:
+        return !seenLower ? UPPER : TITLE;
+      default:
+        return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL;
    }
-    return seenUpper ? MIXED : LOWER;
  }
  private static CharCase charCase(char c) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java
@@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase {
  }
  public void testStemming() {
    assertStemsTo("/", "/");
    assertStemsTo("works", "work");
    assertStemsTo("work", "work");
    assertStemsTo("R2/D2", "R2/D2", "R2/d2");
    assertStemsTo("R2/D2s", "R2/D2");
    assertStemsTo("N/A", "N/A");
    assertStemsTo("N/As");
    assertStemsTo("/", "/");
    assertStemsTo("/a", "/a");
    assertStemsTo("//");
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic
@@ -1,4 +1,6 @@
-3
+5
+/
+/a
 work/A
 R2\/D2/A
 N\/A