From 825d8dbfd95c64528642381ecf7cf28f169d1088 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 5 Feb 2021 11:25:32 +0100 Subject: [PATCH] LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301) --- .../lucene/analysis/hunspell/Dictionary.java | 7 ++-- .../lucene/analysis/hunspell/WordCase.java | 33 ++++++++++++++----- .../lucene/analysis/hunspell/TestEscaped.java | 5 +++ .../lucene/analysis/hunspell/escaped.dic | 4 ++- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index c4d902ef425..048f9c6ecaf 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -832,7 +832,7 @@ public class Dictionary { if (ch == '\\' && i + 1 < entry.length()) { sb.append(entry.charAt(i + 1)); i++; - } else if (ch == '/') { + } else if (ch == '/' && i > 0) { sb.append(FLAG_SEPARATOR); } else if (!shouldSkipEscapedChar(ch)) { sb.append(ch); @@ -902,10 +902,7 @@ public class Dictionary { String line; while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules - if (line.isEmpty() - || line.charAt(0) == '/' - || line.charAt(0) == '#' - || line.charAt(0) == '\t') { + if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java index 1499ee46ca0..434eefe220e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java @@ -17,13 +17,23 @@ package org.apache.lucene.analysis.hunspell; enum WordCase { + /** e.g. WORD */ UPPER, + + /** e.g. Word */ TITLE, + + /** e.g. word */ LOWER, - MIXED; + + /** e.g. WoRd or wOrd */ + MIXED, + + /** e.g "-" or "/" or "42" */ + NEUTRAL; static WordCase caseOf(char[] word, int length) { - boolean startsWithLower = Character.isLowerCase(word[0]); + CharCase startCase = charCase(word[0]); boolean seenUpper = false; boolean seenLower = false; @@ -34,7 +44,7 @@ enum WordCase { if (seenUpper && seenLower) break; } - return get(startsWithLower, seenUpper, seenLower); + return get(startCase, seenUpper, seenLower); } static WordCase caseOf(CharSequence word) { @@ -42,7 +52,7 @@ enum WordCase { } static WordCase caseOf(CharSequence word, int length) { - boolean startsWithLower = Character.isLowerCase(word.charAt(0)); + CharCase startCase = charCase(word.charAt(0)); boolean seenUpper = false; boolean seenLower = false; @@ -53,14 +63,19 @@ enum WordCase { if (seenUpper && seenLower) break; } - return get(startsWithLower, seenUpper, seenLower); + return get(startCase, seenUpper, seenLower); } - private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) { - if (!startsWithLower) { - return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED; + private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) { + if (seenLower && seenUpper) return MIXED; + switch (startCase) { + case LOWER: + return seenUpper ? MIXED : LOWER; + case UPPER: + return !seenLower ? UPPER : TITLE; + default: + return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL; } - return seenUpper ? MIXED : LOWER; } private static CharCase charCase(char c) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java index 3038385665f..93b96cc9aa5 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java @@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase { } public void testStemming() { + assertStemsTo("/", "/"); assertStemsTo("works", "work"); assertStemsTo("work", "work"); assertStemsTo("R2/D2", "R2/D2", "R2/d2"); assertStemsTo("R2/D2s", "R2/D2"); assertStemsTo("N/A", "N/A"); assertStemsTo("N/As"); + + assertStemsTo("/", "/"); + assertStemsTo("/a", "/a"); + assertStemsTo("//"); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic index 93602944260..ac4d4806069 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.dic @@ -1,4 +1,6 @@ -3 +5 +/ +/a work/A R2\/D2/A N\/A