mirror of https://github.com/apache/lucene.git
Added an Arabic snowball stemmer and test dataset
This change adds an Arabic snowball stemmer based on snowballstem.org, as well as an Arabic test dataset in `TestSnowballVocabData.zip`. It also updates the `ant patch-snowball` target to be compatible with the Java classes generated by the latest snowball version (tree: 1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target remains backward compatible with the version of the snowball stemmers used in Lucene 7.x and skips classes that are already patched. Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
parent
6f291d402b
commit
5c567d4fbc
|
@ -226,6 +226,10 @@ New Features
|
|||
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
|
||||
Rectangle) from WKT format. (Nick Knize)
|
||||
|
||||
* LUCENE-8462: Adds an Arabic snowball stemmer based on
|
||||
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
|
||||
(Ryadh Dahimene via Jim Ferenczi)
|
||||
|
||||
Improvements:
|
||||
|
||||
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the
|
||||
|
|
|
@ -7,9 +7,9 @@
|
|||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -17,10 +17,10 @@
|
|||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="analyzers-common" default="default">
|
||||
<project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
|
||||
|
||||
<description>
|
||||
Analyzers for indexing content in different languages and domains.
|
||||
Analyzers for indexing content in different languages and domains.
|
||||
</description>
|
||||
|
||||
<!-- some files for testing that do not have license headers -->
|
||||
|
@ -28,9 +28,9 @@
|
|||
<property name="rat.additional-includes" value="src/tools/**"/>
|
||||
|
||||
<import file="../analysis-module-build.xml"/>
|
||||
|
||||
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
|
||||
|
||||
|
||||
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
|
||||
|
||||
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
|
||||
|
||||
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
||||
|
@ -66,7 +66,7 @@
|
|||
<run-jflex-and-disable-buffer-expansion
|
||||
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
|
||||
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
||||
</target>
|
||||
|
@ -84,28 +84,28 @@
|
|||
</fileset>
|
||||
</delete>
|
||||
</target>
|
||||
|
||||
|
||||
<target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
|
||||
<loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
|
||||
<ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
|
||||
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
|
||||
inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
|
||||
<property name="icu4j.resolved" value="true"/>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
|
||||
<groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
|
||||
<fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
|
||||
</target>
|
||||
|
||||
|
||||
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
|
||||
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
|
||||
|
||||
<target name="gen-tlds" depends="compile-tools">
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true">
|
||||
classname="org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true">
|
||||
<classpath>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
|
@ -117,8 +117,8 @@
|
|||
|
||||
<target name="compile-tools" depends="common.compile-tools">
|
||||
<compile
|
||||
srcdir="src/tools/java"
|
||||
destdir="${build.dir}/classes/tools">
|
||||
srcdir="src/tools/java"
|
||||
destdir="${build.dir}/classes/tools">
|
||||
<classpath refid="classpath"/>
|
||||
</compile>
|
||||
</target>
|
||||
|
@ -126,15 +126,73 @@
|
|||
<target name="javadocs" depends="module-build.javadocs"/>
|
||||
|
||||
<target name="regenerate" depends="jflex,unicode-data"/>
|
||||
|
||||
|
||||
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
|
||||
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
|
||||
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
|
||||
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
|
||||
|
||||
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
</replaceregexp>
|
||||
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
</replaceregexp>
|
||||
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
|
||||
</target>
|
||||
</project>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
|
|||
* Run all languages against their snowball vocabulary tests.
|
||||
*/
|
||||
public void testStemmers() throws IOException {
|
||||
assertCorrectOutput("Arabic", "arabic");
|
||||
assertCorrectOutput("Danish", "danish");
|
||||
assertCorrectOutput("Dutch", "dutch");
|
||||
assertCorrectOutput("English", "english");
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue