LUCENE-9605: update snowball to d8cf01ddf37a, adds Yiddish (#2077)

This commit is contained in:
Robert Muir 2020-11-14 09:27:08 -05:00 committed by GitHub
parent 93ecd0fa0a
commit 52f581e351
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1487 additions and 150 deletions

View File

@ -31,11 +31,11 @@ configure(rootProject) {
configure(project(":lucene:analysis:common")) {
ext {
// git commit hash of source code https://github.com/snowballstem/snowball/
snowballStemmerCommit = "53739a805cfa6c77ff8496dc711dc1c106d987c1"
snowballStemmerCommit = "d8cf01ddf37a9c74a78ada44531c08f7952f2a39"
// git commit hash of stopwords https://github.com/snowballstem/snowball-website
snowballWebsiteCommit = "5a8cf2451d108217585d8e32d744f8b8fd20c711"
snowballWebsiteCommit = "ee7cee9bc52f22802f21e94f42d887b0dfa7d2a8"
// git commit hash of test data https://github.com/snowballstem/snowball-data
snowballDataCommit = "9145f8732ec952c8a3d1066be251da198a8bc792"
snowballDataCommit = "35461050d8f81e8aeac26e38f8a8dbf1afb82721"
snowballWorkDir = file("${buildDir}/snowball")

View File

@ -570,7 +570,7 @@ index 0000000..0cc2b60
+ )
+)
diff --git a/compiler/generator_java.c b/compiler/generator_java.c
index 3a18db7..5909f87 100644
index 2958452..966adb4 100644
--- a/compiler/generator_java.c
+++ b/compiler/generator_java.c
@@ -272,7 +272,7 @@ static void generate_AE(struct generator * g, struct node * p) {
@ -582,7 +582,7 @@ index 3a18db7..5909f87 100644
break;
}
}
@@ -1138,6 +1138,7 @@ static void generate_class_begin(struct generator * g) {
@@ -1140,6 +1140,7 @@ static void generate_class_begin(struct generator * g) {
w(g, " {~+~N"
"~N"
"~Mprivate static final long serialVersionUID = 1L;~N"
@ -590,7 +590,7 @@ index 3a18db7..5909f87 100644
"~N");
}
@@ -1184,7 +1185,7 @@ static void generate_among_table(struct generator * g, struct among * x) {
@@ -1186,7 +1187,7 @@ static void generate_among_table(struct generator * g, struct among * x) {
if (v->function != 0) {
w(g, ", \"");
write_varname(g, v->function);
@ -1013,7 +1013,7 @@ index 73a81a9..f7772d3 100644
public abstract boolean stem();
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index cb39621..9fe141e 100644
index b8ec17a..d2c8e61 100644
--- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt
@@ -10,11 +10,13 @@
@ -1030,7 +1030,7 @@ index cb39621..9fe141e 100644
finnish UTF_8,ISO_8859_1 finnish,fi,fin
french UTF_8,ISO_8859_1 french,fr,fre,fra
german UTF_8,ISO_8859_1 german,de,ger,deu
@@ -50,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter english
@@ -51,12 +53,12 @@ porter UTF_8,ISO_8859_1 porter english
# algorithms are:
#
# german2 - This is a slight modification of the german stemmer.

View File

@ -161,6 +161,8 @@ Improvements
* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
* LUCENE-9605: Update snowball to d8cf01ddf37a, adds Yiddish stemmer. (Robert Muir)
Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -2257,9 +2257,8 @@ private static final char g_ca[] = {119, 95, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
private static final char g_rg[] = {1 };
private int I_p3;
private int I_p2;
private int I_p1;
private boolean B_no_diacritics;
private boolean r_cyr_to_lat() {
@ -2516,7 +2515,7 @@ private boolean r_prelude() {
}
private boolean r_mark_regions() {
I_p3 = 0;
B_no_diacritics = true;
int v_1 = cursor;
lab0: {
golab1: while(true)
@ -2534,11 +2533,10 @@ private boolean r_mark_regions() {
}
cursor++;
}
I_p3 = cursor;
B_no_diacritics = false;
}
cursor = v_1;
I_p1 = limit;
I_p2 = 0;
int v_3 = cursor;
lab3: {
golab4: while(true)
@ -2557,59 +2555,55 @@ private boolean r_mark_regions() {
cursor++;
}
I_p1 = cursor;
if (!(I_p1 < 2))
{
break lab3;
}
golab6: while(true)
{
lab7: {
if (!(out_grouping(g_v, 97, 117)))
{
break lab7;
}
break golab6;
}
if (cursor >= limit)
{
break lab3;
}
cursor++;
}
I_p1 = cursor;
}
cursor = v_3;
int v_5 = cursor;
lab6: {
golab7: while(true)
{
int v_6 = cursor;
lab8: {
golab9: while(true)
{
lab10: {
if (!(eq_s("r")))
{
break lab10;
}
break golab9;
}
if (cursor >= limit)
{
break lab8;
}
break golab7;
}
if (cursor >= limit)
{
break lab6;
}
cursor++;
}
I_p2 = cursor;
if (!((I_p1 - I_p2) > 1))
{
break lab6;
}
I_p1 = I_p2;
}
cursor = v_5;
if (!(I_p1 < 2))
{
return false;
}
lab9: {
int v_7 = cursor;
lab10: {
if (!(I_p1 == I_p2))
{
break lab10;
}
golab11: while(true)
{
lab11: {
int v_8 = cursor;
lab12: {
if (!(eq_s("r")))
if (!(cursor >= 2))
{
break lab12;
}
break golab11;
}
if (cursor >= limit)
{
break lab10;
}
cursor++;
break lab11;
}
cursor = v_8;
golab13: while(true)
{
lab14: {
@ -2621,49 +2615,18 @@ private boolean r_mark_regions() {
}
if (cursor >= limit)
{
break lab10;
break lab8;
}
cursor++;
}
break lab9;
}
cursor = v_7;
if (!(I_p1 != I_p2))
if (!((I_p1 - cursor) > 1))
{
return false;
}
golab15: while(true)
{
lab16: {
if (!(in_grouping(g_v, 97, 117)))
{
break lab16;
}
break golab15;
}
if (cursor >= limit)
{
return false;
}
cursor++;
}
golab17: while(true)
{
lab18: {
if (!(out_grouping(g_v, 97, 117)))
{
break lab18;
}
break golab17;
}
if (cursor >= limit)
{
return false;
}
cursor++;
}
break lab8;
}
I_p1 = cursor;
}
cursor = v_6;
return true;
}
@ -2675,14 +2638,6 @@ private boolean r_R1() {
return true;
}
private boolean r_R2() {
if (!(I_p3 == 0))
{
return false;
}
return true;
}
private boolean r_Step_1() {
int among_var;
ket = cursor;
@ -2712,7 +2667,7 @@ private boolean r_Step_1() {
slice_from("\u010Dajni");
break;
case 7:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2788,7 +2743,7 @@ private boolean r_Step_1() {
slice_from("du\u0161ni");
break;
case 31:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2855,7 +2810,7 @@ private boolean r_Step_1() {
slice_from("\u0161avi");
break;
case 52:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2868,7 +2823,7 @@ private boolean r_Step_1() {
slice_from("a\u010Dka");
break;
case 55:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2878,7 +2833,7 @@ private boolean r_Step_1() {
slice_from("u\u0161ka");
break;
case 57:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2906,7 +2861,7 @@ private boolean r_Step_1() {
slice_from("ti\u010Dni");
break;
case 65:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2931,7 +2886,7 @@ private boolean r_Step_1() {
slice_from("osti");
break;
case 72:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -2992,7 +2947,7 @@ private boolean r_Step_1() {
slice_from("a\u0161ni");
break;
case 91:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -3377,308 +3332,308 @@ private boolean r_Step_2() {
slice_from("at");
break;
case 121:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("luc");
break;
case 122:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("snj");
break;
case 123:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("os");
break;
case 124:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ac");
break;
case 125:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ec");
break;
case 126:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("uc");
break;
case 127:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("rosi");
break;
case 128:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("aca");
break;
case 129:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("jas");
break;
case 130:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("tas");
break;
case 131:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("gas");
break;
case 132:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("nas");
break;
case 133:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("kas");
break;
case 134:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("vas");
break;
case 135:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("bas");
break;
case 136:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("as");
break;
case 137:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("cin");
break;
case 138:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("astaj");
break;
case 139:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("istaj");
break;
case 140:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ostaj");
break;
case 141:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("asta");
break;
case 142:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ista");
break;
case 143:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("osta");
break;
case 144:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ava");
break;
case 145:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("eva");
break;
case 146:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("iva");
break;
case 147:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("uva");
break;
case 148:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ova");
break;
case 149:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("jeti");
break;
case 150:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("inj");
break;
case 151:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ist");
break;
case 152:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("es");
break;
case 153:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("et");
break;
case 154:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("is");
break;
case 155:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ir");
break;
case 156:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ur");
break;
case 157:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("uj");
break;
case 158:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ni");
break;
case 159:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("sn");
break;
case 160:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("ta");
break;
case 161:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("a");
break;
case 162:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("i");
break;
case 163:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
slice_from("e");
break;
case 164:
if (!r_R2())
if (!(B_no_diacritics))
{
return false;
}
@ -3706,9 +3661,7 @@ private boolean r_Step_3() {
public boolean stem() {
r_cyr_to_lat();
r_prelude();
int v_3 = cursor;
r_mark_regions();
cursor = v_3;
limit_backward = cursor;
cursor = limit;
int v_4 = limit - cursor;

File diff suppressed because it is too large Load Diff

View File

@ -30,3 +30,4 @@ Spanish
Swedish
Tamil
Turkish
Yiddish