LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301)

This commit is contained in:
Peter Gromov 2021-02-05 11:25:32 +01:00 committed by GitHub
parent 2f6807cc76
commit 825d8dbfd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 34 additions and 15 deletions

View File

@ -832,7 +832,7 @@ public class Dictionary {
if (ch == '\\' && i + 1 < entry.length()) { if (ch == '\\' && i + 1 < entry.length()) {
sb.append(entry.charAt(i + 1)); sb.append(entry.charAt(i + 1));
i++; i++;
} else if (ch == '/') { } else if (ch == '/' && i > 0) {
sb.append(FLAG_SEPARATOR); sb.append(FLAG_SEPARATOR);
} else if (!shouldSkipEscapedChar(ch)) { } else if (!shouldSkipEscapedChar(ch)) {
sb.append(ch); sb.append(ch);
@ -902,10 +902,7 @@ public class Dictionary {
String line; String line;
while ((line = lines.readLine()) != null) { while ((line = lines.readLine()) != null) {
// wild and unpredictable code comment rules // wild and unpredictable code comment rules
if (line.isEmpty() if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|| line.charAt(0) == '/'
|| line.charAt(0) == '#'
|| line.charAt(0) == '\t') {
continue; continue;
} }
line = unescapeEntry(line); line = unescapeEntry(line);

View File

@ -17,13 +17,23 @@
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
enum WordCase { enum WordCase {
/** e.g. WORD */
UPPER, UPPER,
/** e.g. Word */
TITLE, TITLE,
/** e.g. word */
LOWER, LOWER,
MIXED;
/** e.g. WoRd or wOrd */
MIXED,
/** e.g. "-" or "/" or "42" */
NEUTRAL;
static WordCase caseOf(char[] word, int length) { static WordCase caseOf(char[] word, int length) {
boolean startsWithLower = Character.isLowerCase(word[0]); CharCase startCase = charCase(word[0]);
boolean seenUpper = false; boolean seenUpper = false;
boolean seenLower = false; boolean seenLower = false;
@ -34,7 +44,7 @@ enum WordCase {
if (seenUpper && seenLower) break; if (seenUpper && seenLower) break;
} }
return get(startsWithLower, seenUpper, seenLower); return get(startCase, seenUpper, seenLower);
} }
static WordCase caseOf(CharSequence word) { static WordCase caseOf(CharSequence word) {
@ -42,7 +52,7 @@ enum WordCase {
} }
static WordCase caseOf(CharSequence word, int length) { static WordCase caseOf(CharSequence word, int length) {
boolean startsWithLower = Character.isLowerCase(word.charAt(0)); CharCase startCase = charCase(word.charAt(0));
boolean seenUpper = false; boolean seenUpper = false;
boolean seenLower = false; boolean seenLower = false;
@ -53,14 +63,19 @@ enum WordCase {
if (seenUpper && seenLower) break; if (seenUpper && seenLower) break;
} }
return get(startsWithLower, seenUpper, seenLower); return get(startCase, seenUpper, seenLower);
} }
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) { private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) {
if (!startsWithLower) { if (seenLower && seenUpper) return MIXED;
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED; switch (startCase) {
case LOWER:
return seenUpper ? MIXED : LOWER;
case UPPER:
return !seenLower ? UPPER : TITLE;
default:
return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL;
} }
return seenUpper ? MIXED : LOWER;
} }
private static CharCase charCase(char c) { private static CharCase charCase(char c) {

View File

@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase {
} }
public void testStemming() { public void testStemming() {
assertStemsTo("/", "/");
assertStemsTo("works", "work"); assertStemsTo("works", "work");
assertStemsTo("work", "work"); assertStemsTo("work", "work");
assertStemsTo("R2/D2", "R2/D2", "R2/d2"); assertStemsTo("R2/D2", "R2/D2", "R2/d2");
assertStemsTo("R2/D2s", "R2/D2"); assertStemsTo("R2/D2s", "R2/D2");
assertStemsTo("N/A", "N/A"); assertStemsTo("N/A", "N/A");
assertStemsTo("N/As"); assertStemsTo("N/As");
assertStemsTo("/", "/");
assertStemsTo("/a", "/a");
assertStemsTo("//");
} }
} }

View File

@ -1,4 +1,6 @@
3 5
/
/a
work/A work/A
R2\/D2/A R2\/D2/A
N\/A N\/A