mirror of https://github.com/apache/lucene.git
LUCENE-9732: Hunspell: support dictionary entries starting with slash (#2301)
This commit is contained in:
parent
2f6807cc76
commit
825d8dbfd9
|
@ -832,7 +832,7 @@ public class Dictionary {
|
||||||
if (ch == '\\' && i + 1 < entry.length()) {
|
if (ch == '\\' && i + 1 < entry.length()) {
|
||||||
sb.append(entry.charAt(i + 1));
|
sb.append(entry.charAt(i + 1));
|
||||||
i++;
|
i++;
|
||||||
} else if (ch == '/') {
|
} else if (ch == '/' && i > 0) {
|
||||||
sb.append(FLAG_SEPARATOR);
|
sb.append(FLAG_SEPARATOR);
|
||||||
} else if (!shouldSkipEscapedChar(ch)) {
|
} else if (!shouldSkipEscapedChar(ch)) {
|
||||||
sb.append(ch);
|
sb.append(ch);
|
||||||
|
@ -902,10 +902,7 @@ public class Dictionary {
|
||||||
String line;
|
String line;
|
||||||
while ((line = lines.readLine()) != null) {
|
while ((line = lines.readLine()) != null) {
|
||||||
// wild and unpredictable code comment rules
|
// wild and unpredictable code comment rules
|
||||||
if (line.isEmpty()
|
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||||
|| line.charAt(0) == '/'
|
|
||||||
|| line.charAt(0) == '#'
|
|
||||||
|| line.charAt(0) == '\t') {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
line = unescapeEntry(line);
|
line = unescapeEntry(line);
|
||||||
|
|
|
@ -17,13 +17,23 @@
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
enum WordCase {
|
enum WordCase {
|
||||||
|
/** e.g. WORD */
|
||||||
UPPER,
|
UPPER,
|
||||||
|
|
||||||
|
/** e.g. Word */
|
||||||
TITLE,
|
TITLE,
|
||||||
|
|
||||||
|
/** e.g. word */
|
||||||
LOWER,
|
LOWER,
|
||||||
MIXED;
|
|
||||||
|
/** e.g. WoRd or wOrd */
|
||||||
|
MIXED,
|
||||||
|
|
||||||
|
/** e.g. "-" or "/" or "42" */
|
||||||
|
NEUTRAL;
|
||||||
|
|
||||||
static WordCase caseOf(char[] word, int length) {
|
static WordCase caseOf(char[] word, int length) {
|
||||||
boolean startsWithLower = Character.isLowerCase(word[0]);
|
CharCase startCase = charCase(word[0]);
|
||||||
|
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
|
@ -34,7 +44,7 @@ enum WordCase {
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get(startsWithLower, seenUpper, seenLower);
|
return get(startCase, seenUpper, seenLower);
|
||||||
}
|
}
|
||||||
|
|
||||||
static WordCase caseOf(CharSequence word) {
|
static WordCase caseOf(CharSequence word) {
|
||||||
|
@ -42,7 +52,7 @@ enum WordCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
static WordCase caseOf(CharSequence word, int length) {
|
static WordCase caseOf(CharSequence word, int length) {
|
||||||
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
|
CharCase startCase = charCase(word.charAt(0));
|
||||||
|
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
|
@ -53,14 +63,19 @@ enum WordCase {
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get(startsWithLower, seenUpper, seenLower);
|
return get(startCase, seenUpper, seenLower);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
|
private static WordCase get(CharCase startCase, boolean seenUpper, boolean seenLower) {
|
||||||
if (!startsWithLower) {
|
if (seenLower && seenUpper) return MIXED;
|
||||||
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
|
switch (startCase) {
|
||||||
|
case LOWER:
|
||||||
|
return seenUpper ? MIXED : LOWER;
|
||||||
|
case UPPER:
|
||||||
|
return !seenLower ? UPPER : TITLE;
|
||||||
|
default:
|
||||||
|
return seenLower ? LOWER : seenUpper ? UPPER : NEUTRAL;
|
||||||
}
|
}
|
||||||
return seenUpper ? MIXED : LOWER;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static CharCase charCase(char c) {
|
private static CharCase charCase(char c) {
|
||||||
|
|
|
@ -25,11 +25,16 @@ public class TestEscaped extends StemmerTestBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStemming() {
|
public void testStemming() {
|
||||||
|
assertStemsTo("/", "/");
|
||||||
assertStemsTo("works", "work");
|
assertStemsTo("works", "work");
|
||||||
assertStemsTo("work", "work");
|
assertStemsTo("work", "work");
|
||||||
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
|
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
|
||||||
assertStemsTo("R2/D2s", "R2/D2");
|
assertStemsTo("R2/D2s", "R2/D2");
|
||||||
assertStemsTo("N/A", "N/A");
|
assertStemsTo("N/A", "N/A");
|
||||||
assertStemsTo("N/As");
|
assertStemsTo("N/As");
|
||||||
|
|
||||||
|
assertStemsTo("/", "/");
|
||||||
|
assertStemsTo("/a", "/a");
|
||||||
|
assertStemsTo("//");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
3
|
5
|
||||||
|
/
|
||||||
|
/a
|
||||||
work/A
|
work/A
|
||||||
R2\/D2/A
|
R2\/D2/A
|
||||||
N\/A
|
N\/A
|
||||||
|
|
Loading…
Reference in New Issue