Added an Arabic snowball stemmer and test dataset

This change adds an Arabic snowball stemmer based on snowballstem.org,
as well as an Arabic test dataset in `TestSnowballVocabData.zip`.
It also updates the `ant patch-snowball` target to be compatible with
the Java classes generated by the latest Snowball version (tree:
1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target
remains backward compatible with the version of the Snowball stemmers used in
Lucene 7.x and skips classes that have already been patched.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Ryadh Dahimene 2018-09-13 13:26:34 +01:00 committed by Jim Ferenczi
parent 6f291d402b
commit 5c567d4fbc
5 changed files with 2000 additions and 25 deletions

View File

@ -226,6 +226,10 @@ New Features
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
Rectangle) from WKT format. (Nick Knize)
* LUCENE-8462: Adds an Arabic snowball stemmer based on
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
(Ryadh Dahimene via Jim Ferenczi)
Improvements:
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the

View File

@ -17,7 +17,7 @@
limitations under the License.
-->
<project name="analyzers-common" default="default">
<project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
<description>
Analyzers for indexing content in different languages and domains.
@ -129,12 +129,70 @@
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
</target>
</project>

File diff suppressed because it is too large Load Diff

View File

@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
* Run all languages against their snowball vocabulary tests.
*/
public void testStemmers() throws IOException {
assertCorrectOutput("Arabic", "arabic");
assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english");