Added an Arabic Snowball stemmer and test dataset

This change adds an Arabic Snowball stemmer based on snowballstem.org,
as well as an Arabic test dataset in `TestSnowballVocabData.zip`.
It also updates the `ant patch-snowball` target to be compatible with
the Java classes generated by the latest Snowball version (tree:
1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target
remains backward-compatible with the version of the Snowball stemmers used in
Lucene 7.x and skips classes that have already been patched.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Ryadh Dahimene 2018-09-13 13:26:34 +01:00 committed by Jim Ferenczi
parent 6f291d402b
commit 5c567d4fbc
5 changed files with 2000 additions and 25 deletions

View File

@ -226,6 +226,10 @@ New Features
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
Rectangle) from WKT format. (Nick Knize)
* LUCENE-8462: Adds an Arabic snowball stemmer based on
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
(Ryadh Dahimene via Jim Ferenczi)
Improvements:
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the

View File

@ -7,9 +7,9 @@
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,10 +17,10 @@
limitations under the License.
-->
<project name="analyzers-common" default="default">
<project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
<description>
Analyzers for indexing content in different languages and domains.
Analyzers for indexing content in different languages and domains.
</description>
<!-- some files for testing that do not have license headers -->
@ -28,9 +28,9 @@
<property name="rat.additional-includes" value="src/tools/**"/>
<import file="../analysis-module-build.xml"/>
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
@ -66,7 +66,7 @@
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target>
@ -84,28 +84,28 @@
</fileset>
</delete>
</target>
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
<property name="icu4j.resolved" value="true"/>
</target>
<target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
<groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
<fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
</target>
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
<target name="gen-tlds" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="."
fork="true"
failonerror="true">
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
dir="."
fork="true"
failonerror="true">
<classpath>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
@ -117,8 +117,8 @@
<target name="compile-tools" depends="common.compile-tools">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
<classpath refid="classpath"/>
</compile>
</target>
@ -126,15 +126,73 @@
<target name="javadocs" depends="module-build.javadocs"/>
<target name="regenerate" depends="jflex,unicode-data"/>
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
</replaceregexp>
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
</replaceregexp>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
<restrict>
<fileset refid="snowball.programs"/>
<rsel:not>
<rsel:contains text="patched"/>
</rsel:not>
</restrict>
</replaceregexp>
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
</target>
</project>

File diff suppressed because it is too large Load Diff

View File

@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
* Run all languages against their snowball vocabulary tests.
*/
public void testStemmers() throws IOException {
assertCorrectOutput("Arabic", "arabic");
assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english");