LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301)

This commit is contained in:
Peter Gromov 2021-02-05 11:25:32 +01:00 committed by GitHub
parent 2f6807cc76
commit 825d8dbfd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 34 additions and 15 deletions

View File

@ -832,7 +832,7 @@ public class Dictionary {
if (ch == '\\' && i + 1 < entry.length()) {
sb.append(entry.charAt(i + 1));
i++;
} else if (ch == '/') {
} else if (ch == '/' && i > 0) {
sb.append(FLAG_SEPARATOR);
} else if (!shouldSkipEscapedChar(ch)) {
sb.append(ch);
@ -902,10 +902,7 @@ public class Dictionary {
String line;
while ((line = lines.readLine()) != null) {
// wild and unpredictable code comment rules
if (line.isEmpty()
|| line.charAt(0) == '/'
|| line.charAt(0) == '#'
|| line.charAt(0) == '\t') {
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
continue;
}
line = unescapeEntry(line);

View File

@ -17,13 +17,23 @@
package org.apache.lucene.analysis.hunspell;
enum WordCase {
/** e.g. WORD */
UPPER,
/** e.g. Word */
TITLE,
/** e.g. word */
LOWER,
MIXED;
/** e.g. WoRd or wOrd */
MIXED,
/** e.g. "-" or "/" or "42" */
NEUTRAL;
static WordCase caseOf(char[] word, int length) {
boolean startsWithLower = Character.isLowerCase(word[0]);
CharCase startCase = charCase(word[0]);
boolean seenUpper = false;
boolean seenLower = false;
@ -34,7 +44,7 @@ enum WordCase {
if (seenUpper && seenLower) break;
}
return get(startsWithLower, seenUpper, seenLower);
return get(startCase, seenUpper, seenLower);
}
static WordCase caseOf(CharSequence word) {
@ -42,7 +52,7 @@ enum WordCase {
}
static WordCase caseOf(CharSequence word, int length) {
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
CharCase startCase = charCase(word.charAt(0));
boolean seenUpper = false;
boolean seenLower = false;
@ -53,14 +63,19 @@ enum WordCase {
if (seenUpper && seenLower) break;
}
return get(startsWithLower, seenUpper, seenLower);
return get(startCase, seenUpper, seenLower);
}
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
if (!startsWithLower) {
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) {
if (seenLower && seenUpper) return MIXED;
switch (startCase) {
case LOWER:
return seenUpper ? MIXED : LOWER;
case UPPER:
return !seenLower ? UPPER : TITLE;
default:
return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL;
}
return seenUpper ? MIXED : LOWER;
}
private static CharCase charCase(char c) {

View File

@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase {
}
public void testStemming() {
assertStemsTo("/", "/");
assertStemsTo("works", "work");
assertStemsTo("work", "work");
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
assertStemsTo("R2/D2s", "R2/D2");
assertStemsTo("N/A", "N/A");
assertStemsTo("N/As");
assertStemsTo("/", "/");
assertStemsTo("/a", "/a");
assertStemsTo("//");
}
}

View File

@ -1,4 +1,6 @@
3
5
/
/a
work/A
R2\/D2/A
N\/A