Added an Arabic snowball stemmer and test dataset

This change adds an Arabic snowball stemmer based on snowballstem.org
as well as an Arabic test dataset in `TestSnowballVocabData.zip`.
It also updates the `ant patch-snowball` target to be compatible with
the Java classes generated by the latest snowball version (tree:
1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target
remains backward-compatible with the version of the snowball stemmers used in
Lucene 7.x and skips classes that have already been patched.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Ryadh Dahimene 2018-09-13 13:26:34 +01:00 committed by Jim Ferenczi
parent 6f291d402b
commit 5c567d4fbc
5 changed files with 2000 additions and 25 deletions

View File

@ -226,6 +226,10 @@ New Features
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line, * LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
Rectangle) from WKT format. (Nick Knize) Rectangle) from WKT format. (Nick Knize)
* LUCENE-8462: Adds an Arabic snowball stemmer based on
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
(Ryadh Dahimene via Jim Ferenczi)
Improvements: Improvements:
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the * LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the

View File

@ -7,9 +7,9 @@
The ASF licenses this file to You under the Apache License, Version 2.0 The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,10 +17,10 @@
limitations under the License. limitations under the License.
--> -->
<project name="analyzers-common" default="default"> <project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
<description> <description>
Analyzers for indexing content in different languages and domains. Analyzers for indexing content in different languages and domains.
</description> </description>
<!-- some files for testing that do not have license headers --> <!-- some files for testing that do not have license headers -->
@ -28,9 +28,9 @@
<property name="rat.additional-includes" value="src/tools/**"/> <property name="rat.additional-includes" value="src/tools/**"/>
<import file="../analysis-module-build.xml"/> <import file="../analysis-module-build.xml"/>
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/> <property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/> <property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer, <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
@ -66,7 +66,7 @@
<run-jflex-and-disable-buffer-expansion <run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/> dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target> </target>
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex"> <target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/> <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target> </target>
@ -84,28 +84,28 @@
</fileset> </fileset>
</delete> </delete>
</target> </target>
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure"> <target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/> <loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}" <ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/> inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
<property name="icu4j.resolved" value="true"/> <property name="icu4j.resolved" value="true"/>
</target> </target>
<target name="unicode-data" depends="-resolve-icu4j,resolve-groovy"> <target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
<groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/> <groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
<fixcrlf file="${unicode-props-file}" encoding="UTF-8"/> <fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
</target> </target>
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/> <property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/> <property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
<target name="gen-tlds" depends="compile-tools"> <target name="gen-tlds" depends="compile-tools">
<java <java
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros" classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="." dir="."
fork="true" fork="true"
failonerror="true"> failonerror="true">
<classpath> <classpath>
<pathelement location="${build.dir}/classes/tools"/> <pathelement location="${build.dir}/classes/tools"/>
</classpath> </classpath>
@ -117,8 +117,8 @@
<target name="compile-tools" depends="common.compile-tools"> <target name="compile-tools" depends="common.compile-tools">
<compile <compile
srcdir="src/tools/java" srcdir="src/tools/java"
destdir="${build.dir}/classes/tools"> destdir="${build.dir}/classes/tools">
<classpath refid="classpath"/> <classpath refid="classpath"/>
</compile> </compile>
</target> </target>
@ -126,15 +126,73 @@
<target name="javadocs" depends="module-build.javadocs"/> <target name="javadocs" depends="module-build.javadocs"/>
<target name="regenerate" depends="jflex,unicode-data"/> <target name="regenerate" depends="jflex,unicode-data"/>
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles"> <target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/> <fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/> <fileset refid="snowball.programs"/>
</replaceregexp> <rsel:not>
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8"> <rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/> <fileset refid="snowball.programs"/>
</replaceregexp> <rsel:not>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/> <rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
</target> </target>
</project> </project>

File diff suppressed because it is too large Load Diff

View File

@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
* Run all languages against their snowball vocabulary tests. * Run all languages against their snowball vocabulary tests.
*/ */
public void testStemmers() throws IOException { public void testStemmers() throws IOException {
assertCorrectOutput("Arabic", "arabic");
assertCorrectOutput("Danish", "danish"); assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch"); assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english"); assertCorrectOutput("English", "english");