mirror of https://github.com/apache/lucene.git
Added an Arabic snowball stemmer and test dataset
This change adds an Arabic snowball stemmer based on snowballstem.org, as well as an Arabic test dataset in `TestSnowballVocabData.zip`. It also updates the `ant patch-snowball` target to be compatible with the Java classes generated by the latest snowball version (tree: 1964ce688cbeca505263c8f77e16ed923296ce7a). The `ant patch-snowball` target remains backward compatible with the version of the snowball stemmers used in Lucene 7.x and ignores already patched classes. Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
parent
6f291d402b
commit
5c567d4fbc
|
@ -226,6 +226,10 @@ New Features
|
|||
* LUCENE-8538: Add a Simple WKT Shape Parser for creating Lucene Geometries (Polygon, Line,
|
||||
Rectangle) from WKT format. (Nick Knize)
|
||||
|
||||
* LUCENE-8462: Adds an Arabic snowball stemmer based on
|
||||
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic.sbl
|
||||
(Ryadh Dahimene via Jim Ferenczi)
|
||||
|
||||
Improvements:
|
||||
|
||||
* LUCENE-8521: Change LatLonShape encoding to 7 dimensions instead of 6; where the
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="analyzers-common" default="default">
|
||||
<project name="analyzers-common" default="default" xmlns:rsel="antlib:org.apache.tools.ant.types.resources.selectors">
|
||||
|
||||
<description>
|
||||
Analyzers for indexing content in different languages and domains.
|
||||
|
@ -129,12 +129,70 @@
|
|||
|
||||
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
|
||||
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
|
||||
|
||||
<replaceregexp match="^public class \w+Stemmer\b" replace="@SuppressWarnings(&quot;unused&quot;) \0" flags="m" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
<replaceregexp match="private final static \w+Stemmer methodObject\b.*$" replace="/* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
||||
|
||||
<replaceregexp match="new Among\(([^,]*,[^,]*,[^,]*?)(?=\))" replace="\0, &quot;&quot;, methodObject" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(new Among\([^,]*,[^,]*,[^,]*,[^,]*,)[^,]*?(?=\))" replace="\1 methodObject" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(?:find_among(?:|_b)\()(.*?)(?=\))" replace="\0, \1.length" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="current" replace="getCurrent()" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="(?:eq_s(?:|_b)\()(.*?)(?=\))" replace="\0.length(),\1" flags="g" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<replaceregexp match="private static final long serialVersionUID(.*)" replace="private static final long serialVersionUID = 1L; ${line.separator}${line.separator} /* patched */ private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();" flags="m" encoding="UTF-8">
|
||||
<restrict>
|
||||
<fileset refid="snowball.programs"/>
|
||||
<rsel:not>
|
||||
<rsel:contains text="patched"/>
|
||||
</rsel:not>
|
||||
</restrict>
|
||||
</replaceregexp>
|
||||
|
||||
<fixcrlf srcdir="${snowball.programs.dir}" includes="*Stemmer.java" tab="remove" tablength="2" encoding="UTF-8" javafiles="yes" fixlast="yes"/>
|
||||
</target>
|
||||
</project>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,6 +36,7 @@ public class TestSnowballVocab extends LuceneTestCase {
|
|||
* Run all languages against their snowball vocabulary tests.
|
||||
*/
|
||||
public void testStemmers() throws IOException {
|
||||
assertCorrectOutput("Arabic", "arabic");
|
||||
assertCorrectOutput("Danish", "danish");
|
||||
assertCorrectOutput("Dutch", "dutch");
|
||||
assertCorrectOutput("English", "english");
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue