Added an Arabic Snowball stemmer and test dataset

This change adds an Arabic Snowball stemmer based on snowballstem.org,
as well as an Arabic test dataset in `TestSnowballVocabData.zip`.
It also updates the `ant patch-snowball` target to be compatible with
the Java classes generated by the latest Snowball version (tree:
1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target
remains backward compatible with the version of the Snowball stemmers used in
Lucene 7.x and skips classes that have already been patched.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Ryadh Dahimene 2018-09-13 13:26:34 +01:00 committed by Jim Ferenczi
parent 6f291d402b
commit 5c567d4fbc
5 changed files with 2000 additions and 25 deletions

View File

@ -226,6 +226,10 @@ New Features
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line, * LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
Rectangle) from WKT format. (Nick Knize) Rectangle) from WKT format. (Nick Knize)
* LUCENE-8462: Adds an Arabic snowball stemmer based on
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
(Ryadh Dahimene via Jim Ferenczi)
Improvements: Improvements:
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the * LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the

View File

@ -17,10 +17,10 @@
limitations under the License. limitations under the License.
--> -->
<project name="analyzers-common" default="default"> <project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
<description> <description>
Analyzers for indexing content in different languages and domains. Analyzers for indexing content in different languages and domains.
</description> </description>
<!-- some files for testing that do not have license headers --> <!-- some files for testing that do not have license headers -->
@ -88,7 +88,7 @@
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure"> <target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/> <loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}" <ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/> inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
<property name="icu4j.resolved" value="true"/> <property name="icu4j.resolved" value="true"/>
</target> </target>
@ -102,10 +102,10 @@
<target name="gen-tlds" depends="compile-tools"> <target name="gen-tlds" depends="compile-tools">
<java <java
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros" classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="." dir="."
fork="true" fork="true"
failonerror="true"> failonerror="true">
<classpath> <classpath>
<pathelement location="${build.dir}/classes/tools"/> <pathelement location="${build.dir}/classes/tools"/>
</classpath> </classpath>
@ -117,8 +117,8 @@
<target name="compile-tools" depends="common.compile-tools"> <target name="compile-tools" depends="common.compile-tools">
<compile <compile
srcdir="src/tools/java" srcdir="src/tools/java"
destdir="${build.dir}/classes/tools"> destdir="${build.dir}/classes/tools">
<classpath refid="classpath"/> <classpath refid="classpath"/>
</compile> </compile>
</target> </target>
@ -128,13 +128,71 @@
<target name="regenerate" depends="jflex,unicode-data"/> <target name="regenerate" depends="jflex,unicode-data"/>
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles"> <target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/> <fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/> <fileset refid="snowball.programs"/>
</replaceregexp> <rsel:not>
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8"> <rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/> <fileset refid="snowball.programs"/>
</replaceregexp> <rsel:not>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/> <rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
</target> </target>
</project> </project>

File diff suppressed because it is too large Load Diff

View File

@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
* Run all languages against their snowball vocabulary tests. * Run all languages against their snowball vocabulary tests.
*/ */
public void testStemmers() throws IOException { public void testStemmers() throws IOException {
assertCorrectOutput("Arabic", "arabic");
assertCorrectOutput("Danish", "danish"); assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch"); assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english"); assertCorrectOutput("English", "english");