mirror of https://github.com/apache/lucene.git
LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301)
This commit is contained in:
parent
2f6807cc76
commit
825d8dbfd9
|
@ -832,7 +832,7 @@ public class Dictionary {
|
|||
if (ch == '\\' && i + 1 < entry.length()) {
|
||||
sb.append(entry.charAt(i + 1));
|
||||
i++;
|
||||
} else if (ch == '/') {
|
||||
} else if (ch == '/' && i > 0) {
|
||||
sb.append(FLAG_SEPARATOR);
|
||||
} else if (!shouldSkipEscapedChar(ch)) {
|
||||
sb.append(ch);
|
||||
|
@ -902,10 +902,7 @@ public class Dictionary {
|
|||
String line;
|
||||
while ((line = lines.readLine()) != null) {
|
||||
// wild and unpredictable code comment rules
|
||||
if (line.isEmpty()
|
||||
|| line.charAt(0) == '/'
|
||||
|| line.charAt(0) == '#'
|
||||
|| line.charAt(0) == '\t') {
|
||||
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||
continue;
|
||||
}
|
||||
line = unescapeEntry(line);
|
||||
|
|
|
@ -17,13 +17,23 @@
|
|||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
enum WordCase {
|
||||
/** e.g. WORD */
|
||||
UPPER,
|
||||
|
||||
/** e.g. Word */
|
||||
TITLE,
|
||||
|
||||
/** e.g. word */
|
||||
LOWER,
|
||||
MIXED;
|
||||
|
||||
/** e.g. WoRd or wOrd */
|
||||
MIXED,
|
||||
|
||||
/** e.g. "-" or "/" or "42" */
|
||||
NEUTRAL;
|
||||
|
||||
static WordCase caseOf(char[] word, int length) {
|
||||
boolean startsWithLower = Character.isLowerCase(word[0]);
|
||||
CharCase startCase = charCase(word[0]);
|
||||
|
||||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
|
@ -34,7 +44,7 @@ enum WordCase {
|
|||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
return get(startsWithLower, seenUpper, seenLower);
|
||||
return get(startCase, seenUpper, seenLower);
|
||||
}
|
||||
|
||||
static WordCase caseOf(CharSequence word) {
|
||||
|
@ -42,7 +52,7 @@ enum WordCase {
|
|||
}
|
||||
|
||||
static WordCase caseOf(CharSequence word, int length) {
|
||||
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
|
||||
CharCase startCase = charCase(word.charAt(0));
|
||||
|
||||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
|
@ -53,14 +63,19 @@ enum WordCase {
|
|||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
return get(startsWithLower, seenUpper, seenLower);
|
||||
return get(startCase, seenUpper, seenLower);
|
||||
}
|
||||
|
||||
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
|
||||
if (!startsWithLower) {
|
||||
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
|
||||
private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) {
|
||||
if (seenLower && seenUpper) return MIXED;
|
||||
switch (startCase) {
|
||||
case LOWER:
|
||||
return seenUpper ? MIXED : LOWER;
|
||||
case UPPER:
|
||||
return !seenLower ? UPPER : TITLE;
|
||||
default:
|
||||
return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL;
|
||||
}
|
||||
return seenUpper ? MIXED : LOWER;
|
||||
}
|
||||
|
||||
private static CharCase charCase(char c) {
|
||||
|
|
|
@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase {
|
|||
}
|
||||
|
||||
public void testStemming() {
|
||||
assertStemsTo("/", "/");
|
||||
assertStemsTo("works", "work");
|
||||
assertStemsTo("work", "work");
|
||||
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
|
||||
assertStemsTo("R2/D2s", "R2/D2");
|
||||
assertStemsTo("N/A", "N/A");
|
||||
assertStemsTo("N/As");
|
||||
|
||||
assertStemsTo("/", "/");
|
||||
assertStemsTo("/a", "/a");
|
||||
assertStemsTo("//");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
3
|
||||
5
|
||||
/
|
||||
/a
|
||||
work/A
|
||||
R2\/D2/A
|
||||
N\/A
|
||||
|
|
Loading…
Reference in New Issue