LUCENE-9605: update snowball to d8cf01ddf37a, adds Yiddish (#2077)

This commit is contained in:
Robert Muir 2020-11-14 09:27:08 -05:00 committed by GitHub
parent 93ecd0fa0a
commit 52f581e351
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1487 additions and 150 deletions

View File

@ -31,11 +31,11 @@ configure(rootProject) {
configure(project(":lucene:analysis:common")) { configure(project(":lucene:analysis:common")) {
ext { ext {
// git commit hash of source code https://github.com/snowballstem/snowball/ // git commit hash of source code https://github.com/snowballstem/snowball/
snowballStemmerCommit = "53739a805cfa6c77ff8496dc711dc1c106d987c1" snowballStemmerCommit = "d8cf01ddf37a9c74a78ada44531c08f7952f2a39"
// git commit hash of stopwords https://github.com/snowballstem/snowball-website // git commit hash of stopwords https://github.com/snowballstem/snowball-website
snowballWebsiteCommit = "5a8cf2451d108217585d8e32d744f8b8fd20c711" snowballWebsiteCommit = "ee7cee9bc52f22802f21e94f42d887b0dfa7d2a8"
// git commit hash of test data https://github.com/snowballstem/snowball-data // git commit hash of test data https://github.com/snowballstem/snowball-data
snowballDataCommit = "9145f8732ec952c8a3d1066be251da198a8bc792" snowballDataCommit = "35461050d8f81e8aeac26e38f8a8dbf1afb82721"
snowballWorkDir = file("${buildDir}/snowball") snowballWorkDir = file("${buildDir}/snowball")

View File

@ -570,7 +570,7 @@ index 0000000..0cc2b60
+ ) + )
+) +)
diff --git a/compiler/generator_java.c b/compiler/generator_java.c diff --git a/compiler/generator_java.c b/compiler/generator_java.c
index 3a18db7..5909f87 100644 index 2958452..966adb4 100644
--- a/compiler/generator_java.c --- a/compiler/generator_java.c
+++ b/compiler/generator_java.c +++ b/compiler/generator_java.c
@@ -272,7 +272,7 @@ static void generate_AE(struct generator * g, struct node * p) { @@ -272,7 +272,7 @@ static void generate_AE(struct generator * g, struct node * p) {
@ -582,7 +582,7 @@ index 3a18db7..5909f87 100644
break; break;
} }
} }
@@ -1138,6 +1138,7 @@ static void generate_class_begin(struct generator * g) { @@ -1140,6 +1140,7 @@ static void generate_class_begin(struct generator * g) {
w(g, " {~+~N" w(g, " {~+~N"
"~N" "~N"
"~Mprivate static final long serialVersionUID = 1L;~N" "~Mprivate static final long serialVersionUID = 1L;~N"
@ -590,7 +590,7 @@ index 3a18db7..5909f87 100644
"~N"); "~N");
} }
@@ -1184,7 +1185,7 @@ static void generate_among_table(struct generator * g, struct among * x) { @@ -1186,7 +1187,7 @@ static void generate_among_table(struct generator * g, struct among * x) {
if (v->function != 0) { if (v->function != 0) {
w(g, ", \""); w(g, ", \"");
write_varname(g, v->function); write_varname(g, v->function);
@ -1013,7 +1013,7 @@ index 73a81a9..f7772d3 100644
public abstract boolean stem(); public abstract boolean stem();
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index cb39621..9fe141e 100644 index b8ec17a..d2c8e61 100644
--- a/libstemmer/modules.txt --- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt +++ b/libstemmer/modules.txt
@@ -10,11 +10,13 @@ @@ -10,11 +10,13 @@
@ -1030,7 +1030,7 @@ index cb39621..9fe141e 100644
finnish UTF_8,ISO_8859_1 finnish,fi,fin finnish UTF_8,ISO_8859_1 finnish,fi,fin
french UTF_8,ISO_8859_1 french,fr,fre,fra french UTF_8,ISO_8859_1 french,fr,fre,fra
german UTF_8,ISO_8859_1 german,de,ger,deu german UTF_8,ISO_8859_1 german,de,ger,deu
@@ -50,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter english @@ -51,12 +53,12 @@ porter UTF_8,ISO_8859_1 porter english
# algorithms are: # algorithms are:
# #
# german2 - This is a slight modification of the german stemmer. # german2 - This is a slight modification of the german stemmer.

View File

@ -161,6 +161,8 @@ Improvements
* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields. * LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless) Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
* LUCENE-9605: Update snowball to d8cf01ddf37a, adds Yiddish stemmer. (Robert Muir)
Bug fixes Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -2257,9 +2257,8 @@ private static final char g_ca[] = {119, 95, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
private static final char g_rg[] = {1 }; private static final char g_rg[] = {1 };
private int I_p3;
private int I_p2;
private int I_p1; private int I_p1;
private boolean B_no_diacritics;
private boolean r_cyr_to_lat() { private boolean r_cyr_to_lat() {
@ -2516,7 +2515,7 @@ private boolean r_prelude() {
} }
private boolean r_mark_regions() { private boolean r_mark_regions() {
I_p3 = 0; B_no_diacritics = true;
int v_1 = cursor; int v_1 = cursor;
lab0: { lab0: {
golab1: while(true) golab1: while(true)
@ -2534,11 +2533,10 @@ private boolean r_mark_regions() {
} }
cursor++; cursor++;
} }
I_p3 = cursor; B_no_diacritics = false;
} }
cursor = v_1; cursor = v_1;
I_p1 = limit; I_p1 = limit;
I_p2 = 0;
int v_3 = cursor; int v_3 = cursor;
lab3: { lab3: {
golab4: while(true) golab4: while(true)
@ -2557,59 +2555,55 @@ private boolean r_mark_regions() {
cursor++; cursor++;
} }
I_p1 = cursor; I_p1 = cursor;
if (!(I_p1 < 2))
{
break lab3;
}
golab6: while(true)
{
lab7: {
if (!(out_grouping(g_v, 97, 117)))
{
break lab7;
}
break golab6;
}
if (cursor >= limit)
{
break lab3;
}
cursor++;
}
I_p1 = cursor;
} }
cursor = v_3; cursor = v_3;
int v_5 = cursor; int v_6 = cursor;
lab6: {
golab7: while(true)
{
lab8: { lab8: {
golab9: while(true)
{
lab10: {
if (!(eq_s("r"))) if (!(eq_s("r")))
{
break lab10;
}
break golab9;
}
if (cursor >= limit)
{ {
break lab8; break lab8;
} }
break golab7;
}
if (cursor >= limit)
{
break lab6;
}
cursor++; cursor++;
} }
I_p2 = cursor; lab11: {
if (!((I_p1 - I_p2) > 1)) int v_8 = cursor;
{
break lab6;
}
I_p1 = I_p2;
}
cursor = v_5;
if (!(I_p1 < 2))
{
return false;
}
lab9: {
int v_7 = cursor;
lab10: {
if (!(I_p1 == I_p2))
{
break lab10;
}
golab11: while(true)
{
lab12: { lab12: {
if (!(eq_s("r"))) if (!(cursor >= 2))
{ {
break lab12; break lab12;
} }
break golab11; break lab11;
}
if (cursor >= limit)
{
break lab10;
}
cursor++;
} }
cursor = v_8;
golab13: while(true) golab13: while(true)
{ {
lab14: { lab14: {
@ -2621,49 +2615,18 @@ private boolean r_mark_regions() {
} }
if (cursor >= limit) if (cursor >= limit)
{ {
break lab10; break lab8;
} }
cursor++; cursor++;
} }
break lab9;
} }
cursor = v_7; if (!((I_p1 - cursor) > 1))
if (!(I_p1 != I_p2))
{ {
return false; break lab8;
}
golab15: while(true)
{
lab16: {
if (!(in_grouping(g_v, 97, 117)))
{
break lab16;
}
break golab15;
}
if (cursor >= limit)
{
return false;
}
cursor++;
}
golab17: while(true)
{
lab18: {
if (!(out_grouping(g_v, 97, 117)))
{
break lab18;
}
break golab17;
}
if (cursor >= limit)
{
return false;
}
cursor++;
}
} }
I_p1 = cursor; I_p1 = cursor;
}
cursor = v_6;
return true; return true;
} }
@ -2675,14 +2638,6 @@ private boolean r_R1() {
return true; return true;
} }
private boolean r_R2() {
if (!(I_p3 == 0))
{
return false;
}
return true;
}
private boolean r_Step_1() { private boolean r_Step_1() {
int among_var; int among_var;
ket = cursor; ket = cursor;
@ -2712,7 +2667,7 @@ private boolean r_Step_1() {
slice_from("\u010Dajni"); slice_from("\u010Dajni");
break; break;
case 7: case 7:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2788,7 +2743,7 @@ private boolean r_Step_1() {
slice_from("du\u0161ni"); slice_from("du\u0161ni");
break; break;
case 31: case 31:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2855,7 +2810,7 @@ private boolean r_Step_1() {
slice_from("\u0161avi"); slice_from("\u0161avi");
break; break;
case 52: case 52:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2868,7 +2823,7 @@ private boolean r_Step_1() {
slice_from("a\u010Dka"); slice_from("a\u010Dka");
break; break;
case 55: case 55:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2878,7 +2833,7 @@ private boolean r_Step_1() {
slice_from("u\u0161ka"); slice_from("u\u0161ka");
break; break;
case 57: case 57:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2906,7 +2861,7 @@ private boolean r_Step_1() {
slice_from("ti\u010Dni"); slice_from("ti\u010Dni");
break; break;
case 65: case 65:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2931,7 +2886,7 @@ private boolean r_Step_1() {
slice_from("osti"); slice_from("osti");
break; break;
case 72: case 72:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -2992,7 +2947,7 @@ private boolean r_Step_1() {
slice_from("a\u0161ni"); slice_from("a\u0161ni");
break; break;
case 91: case 91:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -3377,308 +3332,308 @@ private boolean r_Step_2() {
slice_from("at"); slice_from("at");
break; break;
case 121: case 121:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("luc"); slice_from("luc");
break; break;
case 122: case 122:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("snj"); slice_from("snj");
break; break;
case 123: case 123:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("os"); slice_from("os");
break; break;
case 124: case 124:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ac"); slice_from("ac");
break; break;
case 125: case 125:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ec"); slice_from("ec");
break; break;
case 126: case 126:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("uc"); slice_from("uc");
break; break;
case 127: case 127:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("rosi"); slice_from("rosi");
break; break;
case 128: case 128:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("aca"); slice_from("aca");
break; break;
case 129: case 129:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("jas"); slice_from("jas");
break; break;
case 130: case 130:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("tas"); slice_from("tas");
break; break;
case 131: case 131:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("gas"); slice_from("gas");
break; break;
case 132: case 132:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("nas"); slice_from("nas");
break; break;
case 133: case 133:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("kas"); slice_from("kas");
break; break;
case 134: case 134:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("vas"); slice_from("vas");
break; break;
case 135: case 135:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("bas"); slice_from("bas");
break; break;
case 136: case 136:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("as"); slice_from("as");
break; break;
case 137: case 137:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("cin"); slice_from("cin");
break; break;
case 138: case 138:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("astaj"); slice_from("astaj");
break; break;
case 139: case 139:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("istaj"); slice_from("istaj");
break; break;
case 140: case 140:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ostaj"); slice_from("ostaj");
break; break;
case 141: case 141:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("asta"); slice_from("asta");
break; break;
case 142: case 142:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ista"); slice_from("ista");
break; break;
case 143: case 143:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("osta"); slice_from("osta");
break; break;
case 144: case 144:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ava"); slice_from("ava");
break; break;
case 145: case 145:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("eva"); slice_from("eva");
break; break;
case 146: case 146:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("iva"); slice_from("iva");
break; break;
case 147: case 147:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("uva"); slice_from("uva");
break; break;
case 148: case 148:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ova"); slice_from("ova");
break; break;
case 149: case 149:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("jeti"); slice_from("jeti");
break; break;
case 150: case 150:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("inj"); slice_from("inj");
break; break;
case 151: case 151:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ist"); slice_from("ist");
break; break;
case 152: case 152:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("es"); slice_from("es");
break; break;
case 153: case 153:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("et"); slice_from("et");
break; break;
case 154: case 154:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("is"); slice_from("is");
break; break;
case 155: case 155:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ir"); slice_from("ir");
break; break;
case 156: case 156:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ur"); slice_from("ur");
break; break;
case 157: case 157:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("uj"); slice_from("uj");
break; break;
case 158: case 158:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ni"); slice_from("ni");
break; break;
case 159: case 159:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("sn"); slice_from("sn");
break; break;
case 160: case 160:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("ta"); slice_from("ta");
break; break;
case 161: case 161:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("a"); slice_from("a");
break; break;
case 162: case 162:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("i"); slice_from("i");
break; break;
case 163: case 163:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
slice_from("e"); slice_from("e");
break; break;
case 164: case 164:
if (!r_R2()) if (!(B_no_diacritics))
{ {
return false; return false;
} }
@ -3706,9 +3661,7 @@ private boolean r_Step_3() {
public boolean stem() { public boolean stem() {
r_cyr_to_lat(); r_cyr_to_lat();
r_prelude(); r_prelude();
int v_3 = cursor;
r_mark_regions(); r_mark_regions();
cursor = v_3;
limit_backward = cursor; limit_backward = cursor;
cursor = limit; cursor = limit;
int v_4 = limit - cursor; int v_4 = limit - cursor;

File diff suppressed because it is too large Load Diff

View File

@ -30,3 +30,4 @@ Spanish
Swedish Swedish
Tamil Tamil
Turkish Turkish
Yiddish