mirror of https://github.com/apache/lucene.git
Added an Arabic Snowball stemmer and test dataset
This change adds an Arabic Snowball stemmer based on snowballstem.org, as well as an Arabic test dataset in `TestSnowballVocabData.zip`. It also updates the `ant patch-snowball` target to be compatible with the Java classes generated by the latest Snowball version (tree: 1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target remains backward-compatible with the version of the Snowball stemmers used in Lucene 7.x and skips classes that are already patched. Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
parent
6f291d402b
commit
5c567d4fbc
|
@ -226,6 +226,10 @@ New Features
|
||||||
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
|
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
|
||||||
Rectangle) from WKT format. (Nick Knize)
|
Rectangle) from WKT format. (Nick Knize)
|
||||||
|
|
||||||
|
* LUCENE-8462: Adds an Arabic snowball stemmer based on
|
||||||
|
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
|
||||||
|
(Ryadh Dahimene via Jim Ferenczi)
|
||||||
|
|
||||||
Improvements:
|
Improvements:
|
||||||
|
|
||||||
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the
|
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the
|
||||||
|
|
|
@ -7,9 +7,9 @@
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
the "License"); you may not use this file except in compliance with
|
the "License"); you may not use this file except in compliance with
|
||||||
the License. You may obtain a copy of the License at
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
@ -17,10 +17,10 @@
|
||||||
limitations under the License.
|
limitations under the License.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<project name="analyzers-common" default="default">
|
<project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
|
||||||
|
|
||||||
<description>
|
<description>
|
||||||
Analyzers for indexing content in different languages and domains.
|
Analyzers for indexing content in different languages and domains.
|
||||||
</description>
|
</description>
|
||||||
|
|
||||||
<!-- some files for testing that do not have license headers -->
|
<!-- some files for testing that do not have license headers -->
|
||||||
|
@ -28,9 +28,9 @@
|
||||||
<property name="rat.additional-includes" value="src/tools/**"/>
|
<property name="rat.additional-includes" value="src/tools/**"/>
|
||||||
|
|
||||||
<import file="../analysis-module-build.xml"/>
|
<import file="../analysis-module-build.xml"/>
|
||||||
|
|
||||||
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
|
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
|
||||||
|
|
||||||
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
|
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
|
||||||
|
|
||||||
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
||||||
|
@ -66,7 +66,7 @@
|
||||||
<run-jflex-and-disable-buffer-expansion
|
<run-jflex-and-disable-buffer-expansion
|
||||||
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
|
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
|
||||||
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
||||||
</target>
|
</target>
|
||||||
|
@ -84,28 +84,28 @@
|
||||||
</fileset>
|
</fileset>
|
||||||
</delete>
|
</delete>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
|
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
|
||||||
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
|
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
|
||||||
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
|
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
|
||||||
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
|
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
|
||||||
<property name="icu4j.resolved" value="true"/>
|
<property name="icu4j.resolved" value="true"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
|
<target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
|
||||||
<groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
|
<groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
|
||||||
<fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
|
<fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
|
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
|
||||||
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
|
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
|
||||||
|
|
||||||
<target name="gen-tlds" depends="compile-tools">
|
<target name="gen-tlds" depends="compile-tools">
|
||||||
<java
|
<java
|
||||||
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
|
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
|
||||||
dir="."
|
dir="."
|
||||||
fork="true"
|
fork="true"
|
||||||
failonerror="true">
|
failonerror="true">
|
||||||
<classpath>
|
<classpath>
|
||||||
<pathelement location="${build.dir}/classes/tools"/>
|
<pathelement location="${build.dir}/classes/tools"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
@ -117,8 +117,8 @@
|
||||||
|
|
||||||
<target name="compile-tools" depends="common.compile-tools">
|
<target name="compile-tools" depends="common.compile-tools">
|
||||||
<compile
|
<compile
|
||||||
srcdir="src/tools/java"
|
srcdir="src/tools/java"
|
||||||
destdir="${build.dir}/classes/tools">
|
destdir="${build.dir}/classes/tools">
|
||||||
<classpath refid="classpath"/>
|
<classpath refid="classpath"/>
|
||||||
</compile>
|
</compile>
|
||||||
</target>
|
</target>
|
||||||
|
@ -126,15 +126,73 @@
|
||||||
<target name="javadocs" depends="module-build.javadocs"/>
|
<target name="javadocs" depends="module-build.javadocs"/>
|
||||||
|
|
||||||
<target name="regenerate" depends="jflex,unicode-data"/>
|
<target name="regenerate" depends="jflex,unicode-data"/>
|
||||||
|
|
||||||
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
|
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
|
||||||
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
|
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
|
||||||
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings("unused") \0" flags="m" encoding="UTF-8">
|
|
||||||
|
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings("unused") \0" flags="m" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
<fileset refid="snowball.programs"/>
|
<fileset refid="snowball.programs"/>
|
||||||
</replaceregexp>
|
<rsel:not>
|
||||||
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, "", methodObject" flags="g" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
<fileset refid="snowball.programs"/>
|
<fileset refid="snowball.programs"/>
|
||||||
</replaceregexp>
|
<rsel:not>
|
||||||
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
|
<fileset refid="snowball.programs"/>
|
||||||
|
<rsel:not>
|
||||||
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
|
<fileset refid="snowball.programs"/>
|
||||||
|
<rsel:not>
|
||||||
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
|
<fileset refid="snowball.programs"/>
|
||||||
|
<rsel:not>
|
||||||
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
|
<fileset refid="snowball.programs"/>
|
||||||
|
<rsel:not>
|
||||||
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
||||||
|
<restrict>
|
||||||
|
<fileset refid="snowball.programs"/>
|
||||||
|
<rsel:not>
|
||||||
|
<rsel:contains text="patched"/>
|
||||||
|
</rsel:not>
|
||||||
|
</restrict>
|
||||||
|
</replaceregexp>
|
||||||
|
|
||||||
|
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
|
||||||
</target>
|
</target>
|
||||||
</project>
|
</project>
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
|
||||||
* Run all languages against their snowball vocabulary tests.
|
* Run all languages against their snowball vocabulary tests.
|
||||||
*/
|
*/
|
||||||
public void testStemmers() throws IOException {
|
public void testStemmers() throws IOException {
|
||||||
|
assertCorrectOutput("Arabic", "arabic");
|
||||||
assertCorrectOutput("Danish", "danish");
|
assertCorrectOutput("Danish", "danish");
|
||||||
assertCorrectOutput("Dutch", "dutch");
|
assertCorrectOutput("Dutch", "dutch");
|
||||||
assertCorrectOutput("English", "english");
|
assertCorrectOutput("English", "english");
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue