mirror of https://github.com/apache/lucene.git
LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141671 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cec86dbc06
commit
29b09032d3
|
@ -38,6 +38,8 @@
|
||||||
<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
|
<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
|
||||||
<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
|
<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
|
||||||
<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
|
<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
|
||||||
|
<classpathentry kind="src" path="modules/analysis/morfologik/src/java"/>
|
||||||
|
<classpathentry kind="src" path="modules/analysis/morfologik/src/test"/>
|
||||||
<classpathentry kind="src" path="modules/benchmark/src/java"/>
|
<classpathentry kind="src" path="modules/benchmark/src/java"/>
|
||||||
<classpathentry kind="src" path="modules/benchmark/src/test"/>
|
<classpathentry kind="src" path="modules/benchmark/src/test"/>
|
||||||
<classpathentry kind="src" path="modules/common/src/java"/>
|
<classpathentry kind="src" path="modules/common/src/java"/>
|
||||||
|
@ -83,6 +85,9 @@
|
||||||
<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
|
<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
|
||||||
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
|
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
|
||||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
|
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
|
||||||
|
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
||||||
|
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
||||||
|
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
||||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
|
<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
|
||||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
|
<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
|
||||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>
|
<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>
|
||||||
|
|
|
@ -34,6 +34,10 @@ API Changes
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-2341: A new analyzer/filter: Morfologik - a dictionary-driven lemmatizer
|
||||||
|
(accurate stemmer) for Polish (includes morphosyntactic annotations).
|
||||||
|
(Michał Dybizbański, Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-2413: Consolidated Solr analysis components into common.
|
* LUCENE-2413: Consolidated Solr analysis components into common.
|
||||||
New features from Solr now available to Lucene users include:
|
New features from Solr now available to Lucene users include:
|
||||||
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
|
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
|
||||||
|
|
|
@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
SUCH DAMAGE.
|
SUCH DAMAGE.
|
||||||
|
|
||||||
|
The following license applies to the Morfologik project:
|
||||||
|
|
||||||
|
Copyright (c) 2006 Dawid Weiss
|
||||||
|
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Morfologik nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The dictionary comes from Morfologik project. Morfologik uses data from
|
||||||
|
Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
|
||||||
|
is licenced on the terms of (inter alia) LGPL and Creative Commons
|
||||||
|
ShareAlike. The part-of-speech tags were added in Morfologik project and
|
||||||
|
are not found in the data from sjp.pl. The tagset is similar to IPI PAN
|
||||||
|
tagset.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The following license applies to the Morfeusz project,
|
||||||
|
used by org.apache.lucene.analysis.morfologik.
|
||||||
|
|
||||||
|
BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
http://sgjp.pl/morfeusz/
|
||||||
|
|
||||||
|
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
|
||||||
|
Marcin Woliński, Robert Wołosz
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||||
|
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
|
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
|
@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
|
||||||
is derived from Unicode data such as the Unicode Character Database.
|
is derived from Unicode data such as the Unicode Character Database.
|
||||||
See http://unicode.org/copyright.html for more details.
|
See http://unicode.org/copyright.html for more details.
|
||||||
|
|
||||||
|
The Morfologik analyzer (morfologik) includes BSD-licensed software
|
||||||
|
developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
|
||||||
|
|
||||||
|
Morfologik uses data from Polish ispell/myspell dictionary
|
||||||
|
(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
|
||||||
|
LGPL and Creative Commons ShareAlike.
|
||||||
|
|
||||||
|
Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
(http://sgjp.pl/morfeusz/)
|
||||||
|
|
|
@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar
|
||||||
An add-on analysis library that contains a universal algorithmic stemmer,
|
An add-on analysis library that contains a universal algorithmic stemmer,
|
||||||
including tables for the Polish language.
|
including tables for the Polish language.
|
||||||
|
|
||||||
|
lucene-analyzers-morfologik-XX.jar
|
||||||
|
An analyzer using the Morfologik stemming library.
|
||||||
|
|
||||||
common/src/java
|
common/src/java
|
||||||
icu/src/java
|
icu/src/java
|
||||||
phonetic/src/java
|
phonetic/src/java
|
||||||
smartcn/src/java
|
smartcn/src/java
|
||||||
stempel/src/java
|
stempel/src/java
|
||||||
|
morfologik/src/java
|
||||||
The source code for the five libraries.
|
The source code for the five libraries.
|
||||||
|
|
||||||
common/src/test
|
common/src/test
|
||||||
|
@ -47,4 +51,5 @@ icu/src/test
|
||||||
phonetic/src/test
|
phonetic/src/test
|
||||||
smartcn/src/test
|
smartcn/src/test
|
||||||
stempel/src/test
|
stempel/src/test
|
||||||
|
morfologik/src/test
|
||||||
Unit tests for the five libraries.
|
Unit tests for the five libraries.
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
- icu: Analyzers that use functionality from ICU
|
- icu: Analyzers that use functionality from ICU
|
||||||
- smartcn: Smart Analyzer for Simplified Chinese Text
|
- smartcn: Smart Analyzer for Simplified Chinese Text
|
||||||
- stempel: Algorithmic Stemmer for Polish
|
- stempel: Algorithmic Stemmer for Polish
|
||||||
|
- morfologik: Morfologik Stemmer
|
||||||
</description>
|
</description>
|
||||||
|
|
||||||
<target name="common">
|
<target name="common">
|
||||||
|
@ -47,8 +48,12 @@
|
||||||
<ant dir="stempel" />
|
<ant dir="stempel" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<target name="morfologik">
|
||||||
|
<ant dir="morfologik" />
|
||||||
|
</target>
|
||||||
|
|
||||||
<target name="default" depends="compile"/>
|
<target name="default" depends="compile"/>
|
||||||
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
|
<target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
|
||||||
|
|
||||||
<target name="clean">
|
<target name="clean">
|
||||||
<ant dir="common" target="clean" />
|
<ant dir="common" target="clean" />
|
||||||
|
@ -56,6 +61,7 @@
|
||||||
<ant dir="phonetic" target="clean" />
|
<ant dir="phonetic" target="clean" />
|
||||||
<ant dir="smartcn" target="clean" />
|
<ant dir="smartcn" target="clean" />
|
||||||
<ant dir="stempel" target="clean" />
|
<ant dir="stempel" target="clean" />
|
||||||
|
<ant dir="morfologik" target="clean" />
|
||||||
</target>
|
</target>
|
||||||
<target name="validate">
|
<target name="validate">
|
||||||
<ant dir="common" target="validate" />
|
<ant dir="common" target="validate" />
|
||||||
|
@ -63,6 +69,7 @@
|
||||||
<ant dir="phonetic" target="validate" />
|
<ant dir="phonetic" target="validate" />
|
||||||
<ant dir="smartcn" target="validate" />
|
<ant dir="smartcn" target="validate" />
|
||||||
<ant dir="stempel" target="validate" />
|
<ant dir="stempel" target="validate" />
|
||||||
|
<ant dir="morfologik" target="validate" />
|
||||||
</target>
|
</target>
|
||||||
<target name="compile-core">
|
<target name="compile-core">
|
||||||
<ant dir="common" target="compile-core" />
|
<ant dir="common" target="compile-core" />
|
||||||
|
@ -70,6 +77,7 @@
|
||||||
<ant dir="phonetic" target="compile-core" />
|
<ant dir="phonetic" target="compile-core" />
|
||||||
<ant dir="smartcn" target="compile-core" />
|
<ant dir="smartcn" target="compile-core" />
|
||||||
<ant dir="stempel" target="compile-core" />
|
<ant dir="stempel" target="compile-core" />
|
||||||
|
<ant dir="morfologik" target="compile-core" />
|
||||||
</target>
|
</target>
|
||||||
<target name="compile-test">
|
<target name="compile-test">
|
||||||
<ant dir="common" target="compile-test" />
|
<ant dir="common" target="compile-test" />
|
||||||
|
@ -77,6 +85,7 @@
|
||||||
<ant dir="phonetic" target="compile-test" />
|
<ant dir="phonetic" target="compile-test" />
|
||||||
<ant dir="smartcn" target="compile-test" />
|
<ant dir="smartcn" target="compile-test" />
|
||||||
<ant dir="stempel" target="compile-test" />
|
<ant dir="stempel" target="compile-test" />
|
||||||
|
<ant dir="morfologik" target="compile-test" />
|
||||||
</target>
|
</target>
|
||||||
<target name="test">
|
<target name="test">
|
||||||
<ant dir="common" target="test" />
|
<ant dir="common" target="test" />
|
||||||
|
@ -84,6 +93,7 @@
|
||||||
<ant dir="phonetic" target="test" />
|
<ant dir="phonetic" target="test" />
|
||||||
<ant dir="smartcn" target="test" />
|
<ant dir="smartcn" target="test" />
|
||||||
<ant dir="stempel" target="test" />
|
<ant dir="stempel" target="test" />
|
||||||
|
<ant dir="morfologik" target="test" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="build-artifacts-and-tests" depends="default,compile-test" />
|
<target name="build-artifacts-and-tests" depends="default,compile-test" />
|
||||||
|
@ -94,6 +104,7 @@
|
||||||
<ant dir="phonetic" target="dist-maven" />
|
<ant dir="phonetic" target="dist-maven" />
|
||||||
<ant dir="smartcn" target="dist-maven" />
|
<ant dir="smartcn" target="dist-maven" />
|
||||||
<ant dir="stempel" target="dist-maven" />
|
<ant dir="stempel" target="dist-maven" />
|
||||||
|
<ant dir="morfologik" target="dist-maven" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="javadocs">
|
<target name="javadocs">
|
||||||
|
@ -102,6 +113,7 @@
|
||||||
<ant dir="phonetic" target="javadocs" />
|
<ant dir="phonetic" target="javadocs" />
|
||||||
<ant dir="smartcn" target="javadocs" />
|
<ant dir="smartcn" target="javadocs" />
|
||||||
<ant dir="stempel" target="javadocs" />
|
<ant dir="stempel" target="javadocs" />
|
||||||
|
<ant dir="morfologik" target="javadocs" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="javadocs-index.html">
|
<target name="javadocs-index.html">
|
||||||
|
@ -110,6 +122,7 @@
|
||||||
<ant dir="phonetic" target="javadocs-index.html" />
|
<ant dir="phonetic" target="javadocs-index.html" />
|
||||||
<ant dir="smartcn" target="javadocs-index.html" />
|
<ant dir="smartcn" target="javadocs-index.html" />
|
||||||
<ant dir="stempel" target="javadocs-index.html" />
|
<ant dir="stempel" target="javadocs-index.html" />
|
||||||
|
<ant dir="morfologik" target="javadocs-index.html" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project name="analyzers-morfologik" default="default">
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Morfologik Analyzer
|
||||||
|
</description>
|
||||||
|
|
||||||
|
<property name="build.dir" location="../build/morfologik" />
|
||||||
|
<property name="dist.dir" location="../dist/morfologik" />
|
||||||
|
|
||||||
|
<path id="additional.dependencies">
|
||||||
|
<fileset dir="lib" includes="morfologik-fsa-*.jar"/>
|
||||||
|
<fileset dir="lib" includes="morfologik-polish-*.jar"/>
|
||||||
|
<fileset dir="lib" includes="morfologik-stemming-*.jar"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies" />
|
||||||
|
|
||||||
|
<import file="../../../lucene/contrib/contrib-build.xml"/>
|
||||||
|
|
||||||
|
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
|
||||||
|
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
|
||||||
|
|
||||||
|
<path id="classpath">
|
||||||
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
|
<path refid="base.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<path refid="classpath"/>
|
||||||
|
<pathelement location="../../../lucene/build/classes/test-framework/"/>
|
||||||
|
<pathelement location="../../../lucene/build/classes/test/"/>
|
||||||
|
<path refid="junit-path"/>
|
||||||
|
<pathelement location="${build.dir}/classes/java"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
|
||||||
|
|
||||||
|
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
|
||||||
|
<echo>Morfologik building dependency ${analyzers-common.jar}</echo>
|
||||||
|
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
|
||||||
|
</target>
|
||||||
|
</project>
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,29 @@
|
||||||
|
|
||||||
|
Copyright (c) 2006 Dawid Weiss
|
||||||
|
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Morfologik nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,2 @@
|
||||||
|
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||||
|
(http://morfologik.blogspot.com/).
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,41 @@
|
||||||
|
morfologik-polish, TERMS OF LICENCE
|
||||||
|
|
||||||
|
This JAR contains and makes use of data from Polish ispell/myspell
|
||||||
|
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
||||||
|
licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses.
|
||||||
|
|
||||||
|
Part-of-speech tags were added in Morfologik project and are not found
|
||||||
|
in the data from sjp.pl.
|
||||||
|
|
||||||
|
|
||||||
|
BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
http://sgjp.pl/morfeusz/
|
||||||
|
|
||||||
|
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
|
||||||
|
Marcin Woliński, Robert Wołosz
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||||
|
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
|
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,8 @@
|
||||||
|
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||||
|
(http://morfologik.blogspot.com/).
|
||||||
|
|
||||||
|
This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/)
|
||||||
|
licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike.
|
||||||
|
|
||||||
|
This product includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||||
|
(http://sgjp.pl/morfeusz/)
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,29 @@
|
||||||
|
|
||||||
|
Copyright (c) 2006 Dawid Weiss
|
||||||
|
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Morfologik nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,2 @@
|
||||||
|
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||||
|
(http://morfologik.blogspot.com/).
|
|
@ -0,0 +1,84 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
|
||||||
|
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||||
|
*/
|
||||||
|
public class MorfologikAnalyzer extends ReusableAnalyzerBase {
|
||||||
|
|
||||||
|
private final DICTIONARY dictionary;
|
||||||
|
private final Version version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
|
||||||
|
*
|
||||||
|
* @param vers
|
||||||
|
* lucene compatibility version
|
||||||
|
* @param dict
|
||||||
|
* A constant specifying which dictionary to choose. See the
|
||||||
|
* Morfologik documentation for details or use the default.
|
||||||
|
*/
|
||||||
|
public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
|
||||||
|
this.version = vers;
|
||||||
|
this.dictionary = dict;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer that uses the default dictionary ({@code DICTIONARY.MORFOLOGIK}).
|
||||||
|
*
|
||||||
|
* @param vers lucene compatibility version
|
||||||
|
*/
|
||||||
|
public MorfologikAnalyzer(final Version vers) {
|
||||||
|
this(vers, DICTIONARY.MORFOLOGIK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a
|
||||||
|
* {@link ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* which tokenizes all the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @param field ignored field name
|
||||||
|
* @param reader source of tokens
|
||||||
|
*
|
||||||
|
* @return A
|
||||||
|
* {@link ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
|
* {@link StandardFilter} and {@link MorfologikFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(final String field, final Reader reader) {
|
||||||
|
final Tokenizer src = new StandardTokenizer(this.version, reader);
|
||||||
|
|
||||||
|
return new TokenStreamComponents(
|
||||||
|
src,
|
||||||
|
new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,134 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import morfologik.stemming.*;
|
||||||
|
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link TokenFilter} using Morfologik library.
|
||||||
|
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||||
|
*
|
||||||
|
* MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
|
||||||
|
* annotations for produced lemmas. See the Morfologik documentation for details.
|
||||||
|
*/
|
||||||
|
public class MorfologikFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
private final CharsRef scratch = new CharsRef(0);
|
||||||
|
private final CharacterUtils charUtils;
|
||||||
|
|
||||||
|
private State current;
|
||||||
|
private final TokenStream input;
|
||||||
|
private final IStemmer stemmer;
|
||||||
|
|
||||||
|
private List<WordData> lemmaList;
|
||||||
|
private int lemmaListIndex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a filter for given PolishStemmer.DICTIONARY enum.
|
||||||
|
*
|
||||||
|
* @param in input token stream
|
||||||
|
* @param dict PolishStemmer.DICTIONARY enum
|
||||||
|
* @param version Lucene version compatibility for lowercasing.
|
||||||
|
*/
|
||||||
|
public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
|
||||||
|
super(in);
|
||||||
|
this.input = in;
|
||||||
|
this.stemmer = new PolishStemmer(dict);
|
||||||
|
this.charUtils = CharacterUtils.getInstance(version);
|
||||||
|
this.lemmaList = Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void popNextLemma() {
|
||||||
|
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||||
|
termAtt.setEmpty().append(lemma.getStem());
|
||||||
|
tagAtt.setTag(lemma.getTag());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lookup a given surface form of a token and update
|
||||||
|
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
|
||||||
|
*/
|
||||||
|
private boolean lookupSurfaceForm(CharSequence token) {
|
||||||
|
lemmaList = this.stemmer.lookup(token);
|
||||||
|
lemmaListIndex = 0;
|
||||||
|
return lemmaList.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Retrieves the next token (possibly from the list of lemmas). */
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws IOException {
|
||||||
|
if (lemmaListIndex < lemmaList.size()) {
|
||||||
|
restoreState(current);
|
||||||
|
posIncrAtt.setPositionIncrement(0);
|
||||||
|
popNextLemma();
|
||||||
|
return true;
|
||||||
|
} else if (this.input.incrementToken()) {
|
||||||
|
if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
|
||||||
|
current = captureState();
|
||||||
|
popNextLemma();
|
||||||
|
} else {
|
||||||
|
tagAtt.clear();
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert to lowercase in-place.
|
||||||
|
*/
|
||||||
|
private CharSequence toLowercase(CharSequence chs) {
|
||||||
|
final int length = scratch.length = chs.length();
|
||||||
|
scratch.grow(length);
|
||||||
|
|
||||||
|
char buffer[] = scratch.chars;
|
||||||
|
for (int i = 0; i < length;) {
|
||||||
|
i += Character.toChars(
|
||||||
|
Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Resets stems accumulator and hands over to superclass. */
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
lemmaListIndex = 0;
|
||||||
|
lemmaList = Collections.emptyList();
|
||||||
|
super.reset();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Morfologik dictionaries provide morphosyntactic annotations for
|
||||||
|
* surface forms. For the exact format and description of these,
|
||||||
|
* see the project's documentation (annotations vary by dictionary!).
|
||||||
|
*/
|
||||||
|
public interface MorphosyntacticTagAttribute extends Attribute {
|
||||||
|
/**
|
||||||
|
* Set the POS tag. The default value (no-value) is null.
|
||||||
|
* @param pos POS tag corresponding to current lemma
|
||||||
|
*/
|
||||||
|
public void setTag(CharSequence pos);
|
||||||
|
|
||||||
|
/** Returns the POS tag of the term. */
|
||||||
|
public CharSequence getTag();
|
||||||
|
|
||||||
|
/** Clear to default value. */
|
||||||
|
public void clear();
|
||||||
|
}
|
|
@ -0,0 +1,91 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.AttributeImpl;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see MorphosyntacticTagAttribute
|
||||||
|
*/
|
||||||
|
public class MorphosyntacticTagAttributeImpl extends AttributeImpl
|
||||||
|
implements MorphosyntacticTagAttribute, Cloneable {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Either the original tag from WordData or a clone.
|
||||||
|
*/
|
||||||
|
private CharSequence tag;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the tag.
|
||||||
|
*/
|
||||||
|
public void setTag(CharSequence pos) {
|
||||||
|
this.tag = ((pos == null || pos.length() == 0) ? null : pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the POS tag of the term. If you need a copy of this char sequence, clone it
|
||||||
|
* because it may change with each new term!
|
||||||
|
*/
|
||||||
|
public CharSequence getTag() {
|
||||||
|
return tag;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
tag = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other instanceof MorphosyntacticTagAttribute) {
|
||||||
|
return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if two char sequences are the same.
|
||||||
|
*/
|
||||||
|
private boolean equal(CharSequence chs1, CharSequence chs2) {
|
||||||
|
if (chs1 == null && chs2 == null)
|
||||||
|
return true;
|
||||||
|
if (chs1 == null || chs2 == null)
|
||||||
|
return false;
|
||||||
|
int l1 = chs1.length();
|
||||||
|
int l2 = chs2.length();
|
||||||
|
if (l1 != l2)
|
||||||
|
return false;
|
||||||
|
for (int i = 0; i < l1; i++)
|
||||||
|
if (chs1.charAt(i) != chs2.charAt(i))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return this.tag == null ? 0 : tag.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(AttributeImpl target) {
|
||||||
|
((MorphosyntacticTagAttribute) target).setTag(this.tag);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object clone() {
|
||||||
|
MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
|
||||||
|
cloned.tag = (tag == null ? null : tag.toString());
|
||||||
|
return cloned;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
This package provides a dictionary-driven lemmatization ("accurate stemming")
|
||||||
|
filter and analyzer for the Polish Language, driven by the
|
||||||
|
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
|
||||||
|
by Dawid Weiss and Marcin Miłkowski.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
The MorfologikFilter yields one or more terms for each token. Each
|
||||||
|
of those terms is given the same position in the index.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,34 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
This package provides a dictionary-driven lemmatization ("accurate stemming")
|
||||||
|
filter and analyzer for the Polish Language, driven by the
|
||||||
|
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
|
||||||
|
by Dawid Weiss and Marcin Miłkowski.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
The MorfologikFilter yields one or more terms for each token. Each
|
||||||
|
of those terms is given the same position in the index.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,105 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
|
||||||
|
*/
|
||||||
|
public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
private Analyzer getTestAnalyzer() {
|
||||||
|
return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test stemming of single tokens with Morfologik library. */
|
||||||
|
public final void testSingleTokens() throws IOException {
|
||||||
|
Analyzer a = getTestAnalyzer();
|
||||||
|
assertAnalyzesToReuse(a, "a", new String[] { "a" });
|
||||||
|
assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
|
||||||
|
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
|
||||||
|
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test stemming of multiple tokens and proper term metrics. */
|
||||||
|
public final void testMultipleTokens() throws IOException {
|
||||||
|
Analyzer a = getTestAnalyzer();
|
||||||
|
assertAnalyzesToReuse(
|
||||||
|
a,
|
||||||
|
"liście danych",
|
||||||
|
new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
|
||||||
|
new int[] { 0, 0, 0, 7, 7, 7 },
|
||||||
|
new int[] { 6, 6, 6, 13, 13, 13 },
|
||||||
|
new int[] { 1, 0, 0, 1, 0, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||||
|
public final void testLeftoverStems() throws IOException {
|
||||||
|
Analyzer a = getTestAnalyzer();
|
||||||
|
TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
|
||||||
|
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||||
|
ts_1.reset();
|
||||||
|
ts_1.incrementToken();
|
||||||
|
assertEquals("first stream", "liść", termAtt_1.toString());
|
||||||
|
|
||||||
|
TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
|
||||||
|
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||||
|
ts_2.reset();
|
||||||
|
ts_2.incrementToken();
|
||||||
|
assertEquals("second stream", "dany", termAtt_2.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test stemming of mixed-case tokens. */
|
||||||
|
public final void testCase() throws IOException {
|
||||||
|
Analyzer a = getTestAnalyzer();
|
||||||
|
|
||||||
|
assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
|
||||||
|
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
|
||||||
|
|
||||||
|
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
|
||||||
|
assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
|
||||||
|
|
||||||
|
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
|
||||||
|
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
|
||||||
|
|
||||||
|
assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
|
||||||
|
ts.incrementToken();
|
||||||
|
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
|
||||||
|
assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test morphosyntactic annotations. */
|
||||||
|
public final void testPOSAttribute() throws IOException {
|
||||||
|
TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
|
||||||
|
|
||||||
|
assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
|
||||||
|
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
|
||||||
|
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue