mirror of https://github.com/apache/lucene.git
LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141022 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
216e696809
commit
d188d3df90
|
@ -38,14 +38,15 @@
|
|||
<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
|
||||
<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
|
||||
<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
|
||||
<classpathentry kind="src" path="modules/analysis/morfologik/src/java"/>
|
||||
<classpathentry kind="src" path="modules/analysis/morfologik/src/test"/>
|
||||
<classpathentry kind="src" path="modules/benchmark/src/java"/>
|
||||
<classpathentry kind="src" path="modules/benchmark/src/test"/>
|
||||
<classpathentry kind="src" path="modules/common/src/java"/>
|
||||
<classpathentry kind="src" path="modules/common/src/test"/>
|
||||
<classpathentry kind="src" path="modules/grouping/src/java"/>
|
||||
<classpathentry kind="src" path="modules/grouping/src/test"/>
|
||||
<classpathentry kind="src" path="modules/queries/src/java"/>
|
||||
<classpathentry kind="src" path="modules/queries/src/test"/>
|
||||
<classpathentry kind="src" path="modules/queries/src/java"/>
|
||||
<classpathentry kind="src" path="modules/suggest/src/java"/>
|
||||
<classpathentry kind="src" path="modules/suggest/src/test"/>
|
||||
<classpathentry kind="src" path="solr/src/java"/>
|
||||
|
@ -80,6 +81,9 @@
|
|||
<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
|
||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
|
||||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>
|
||||
|
|
|
@ -33,7 +33,11 @@ API Changes
|
|||
in half. (Robert Muir)
|
||||
|
||||
New Features
|
||||
|
||||
|
||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||
(accurate stemmer) for Polish (includes morphosyntactic annotations).
|
||||
(Michał Dybizbański, Dawid Weiss)
|
||||
|
||||
* LUCENE-2413: Consolidated Solr analysis components into common.
|
||||
New features from Solr now available to Lucene users include:
|
||||
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
|
||||
|
|
|
@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGE.
|
||||
|
||||
The following license applies to the Morfologik project:
|
||||
|
||||
Copyright (c) 2006 Dawid Weiss
|
||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Morfologik nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
---
|
||||
|
||||
The dictionary comes from Morfologik project. Morfologik uses data from
|
||||
Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
|
||||
is licenced on the terms of (inter alia) LGPL and Creative Commons
|
||||
ShareAlike. The part-of-speech tags were added in Morfologik project and
|
||||
are not found in the data from sjp.pl. The tagset is similar to IPI PAN
|
||||
tagset.
|
||||
|
||||
---
|
||||
|
||||
The following license applies to the Morfeusz project,
|
||||
used by org.apache.lucene.analysis.morfologik.
|
||||
|
||||
BSD-licensed dictionary of Polish (SGJP)
|
||||
http://sgjp.pl/morfeusz/
|
||||
|
||||
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
|
||||
Marcin Woliński, Robert Wołosz
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
|
|
@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
|
|||
is derived from Unicode data such as the Unicode Character Database.
|
||||
See http://unicode.org/copyright.html for more details.
|
||||
|
||||
The Morfologik analyzer (morfologik) includes BSD-licensed software
|
||||
developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
|
||||
|
||||
Morfologik uses data from Polish ispell/myspell dictionary
|
||||
(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
|
||||
LGPL and Creative Commons ShareAlike.
|
||||
|
||||
Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||
(http://sgjp.pl/morfeusz/)
|
||||
|
|
|
@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar
|
|||
An add-on analysis library that contains a universal algorithmic stemmer,
|
||||
including tables for the Polish language.
|
||||
|
||||
lucene-analyzers-morfologik-XX.jar
|
||||
An analyzer using the Morfologik stemming library.
|
||||
|
||||
common/src/java
|
||||
icu/src/java
|
||||
phonetic/src/java
|
||||
smartcn/src/java
|
||||
stempel/src/java
|
||||
morfologik/src/java
|
||||
The source code for the six libraries.
|
||||
|
||||
common/src/test
|
||||
|
@ -47,4 +51,5 @@ icu/src/test
|
|||
phonetic/src/test
|
||||
smartcn/src/test
|
||||
stempel/src/test
|
||||
morfologik/src/test
|
||||
Unit tests for the six libraries.
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
- icu: Analyzers that use functionality from ICU
|
||||
- smartcn: Smart Analyzer for Simplified Chinese Text
|
||||
- stempel: Algorithmic Stemmer for Polish
|
||||
- morfologik: Morfologik Stemmer
|
||||
</description>
|
||||
|
||||
<target name="common">
|
||||
|
@ -47,8 +48,12 @@
|
|||
<ant dir="stempel" />
|
||||
</target>
|
||||
|
||||
<target name="morfologik">
|
||||
<ant dir="morfologik" />
|
||||
</target>
|
||||
|
||||
<target name="default" depends="compile"/>
|
||||
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
|
||||
<target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
|
||||
|
||||
<target name="clean">
|
||||
<ant dir="common" target="clean" />
|
||||
|
@ -56,6 +61,7 @@
|
|||
<ant dir="phonetic" target="clean" />
|
||||
<ant dir="smartcn" target="clean" />
|
||||
<ant dir="stempel" target="clean" />
|
||||
<ant dir="morfologik" target="clean" />
|
||||
</target>
|
||||
<target name="validate">
|
||||
<ant dir="common" target="validate" />
|
||||
|
@ -63,6 +69,7 @@
|
|||
<ant dir="phonetic" target="validate" />
|
||||
<ant dir="smartcn" target="validate" />
|
||||
<ant dir="stempel" target="validate" />
|
||||
<ant dir="morfologik" target="validate" />
|
||||
</target>
|
||||
<target name="compile-core">
|
||||
<ant dir="common" target="compile-core" />
|
||||
|
@ -70,6 +77,7 @@
|
|||
<ant dir="phonetic" target="compile-core" />
|
||||
<ant dir="smartcn" target="compile-core" />
|
||||
<ant dir="stempel" target="compile-core" />
|
||||
<ant dir="morfologik" target="compile-core" />
|
||||
</target>
|
||||
<target name="compile-test">
|
||||
<ant dir="common" target="compile-test" />
|
||||
|
@ -77,6 +85,7 @@
|
|||
<ant dir="phonetic" target="compile-test" />
|
||||
<ant dir="smartcn" target="compile-test" />
|
||||
<ant dir="stempel" target="compile-test" />
|
||||
<ant dir="morfologik" target="compile-test" />
|
||||
</target>
|
||||
<target name="test">
|
||||
<ant dir="common" target="test" />
|
||||
|
@ -84,6 +93,7 @@
|
|||
<ant dir="phonetic" target="test" />
|
||||
<ant dir="smartcn" target="test" />
|
||||
<ant dir="stempel" target="test" />
|
||||
<ant dir="morfologik" target="test" />
|
||||
</target>
|
||||
|
||||
<target name="build-artifacts-and-tests" depends="default,compile-test" />
|
||||
|
@ -94,6 +104,7 @@
|
|||
<ant dir="phonetic" target="dist-maven" />
|
||||
<ant dir="smartcn" target="dist-maven" />
|
||||
<ant dir="stempel" target="dist-maven" />
|
||||
<ant dir="morfologik" target="dist-maven" />
|
||||
</target>
|
||||
|
||||
<target name="javadocs">
|
||||
|
@ -102,6 +113,7 @@
|
|||
<ant dir="phonetic" target="javadocs" />
|
||||
<ant dir="smartcn" target="javadocs" />
|
||||
<ant dir="stempel" target="javadocs" />
|
||||
<ant dir="morfologik" target="javadocs" />
|
||||
</target>
|
||||
|
||||
<target name="javadocs-index.html">
|
||||
|
@ -110,6 +122,7 @@
|
|||
<ant dir="phonetic" target="javadocs-index.html" />
|
||||
<ant dir="smartcn" target="javadocs-index.html" />
|
||||
<ant dir="stempel" target="javadocs-index.html" />
|
||||
<ant dir="morfologik" target="javadocs-index.html" />
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="analyzers-morfologik" default="default">
|
||||
|
||||
<description>
|
||||
Morfologik Analyzer
|
||||
</description>
|
||||
|
||||
<property name="build.dir" location="../build/morfologik" />
|
||||
<property name="dist.dir" location="../dist/morfologik" />
|
||||
|
||||
<path id="additional.dependencies">
|
||||
<fileset dir="lib" includes="morfologik-fsa-*.jar"/>
|
||||
<fileset dir="lib" includes="morfologik-polish-*.jar"/>
|
||||
<fileset dir="lib" includes="morfologik-stemming-*.jar"/>
|
||||
</path>
|
||||
|
||||
<pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies" />
|
||||
|
||||
<import file="../../../lucene/contrib/contrib-build.xml"/>
|
||||
|
||||
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
|
||||
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${analyzers-common.jar}"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="classpath"/>
|
||||
<pathelement location="../../../lucene/build/classes/test-framework/"/>
|
||||
<pathelement location="../../../lucene/build/classes/test/"/>
|
||||
<path refid="junit-path"/>
|
||||
<pathelement location="${build.dir}/classes/java"/>
|
||||
</path>
|
||||
|
||||
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
|
||||
|
||||
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
|
||||
<echo>Morfologik building dependency ${analyzers-common.jar}</echo>
|
||||
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
|
||||
</target>
|
||||
</project>
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
Copyright (c) 2006 Dawid Weiss
|
||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Morfologik nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,2 @@
|
|||
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||
(http://morfologik.blogspot.com/).
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,41 @@
|
|||
morfologik-polish, TERMS OF LICENCE
|
||||
|
||||
This JAR contains and makes use of data from Polish ispell/myspell
|
||||
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
||||
licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses.
|
||||
|
||||
Part-of-speech tags were added in Morfologik project and are not found
|
||||
in the data from sjp.pl.
|
||||
|
||||
|
||||
BSD-licensed dictionary of Polish (SGJP)
|
||||
http://sgjp.pl/morfeusz/
|
||||
|
||||
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
|
||||
Marcin Woliński, Robert Wołosz
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,8 @@
|
|||
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||
(http://morfologik.blogspot.com/).
|
||||
|
||||
This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/)
|
||||
licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike.
|
||||
|
||||
This product includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||
(http://sgjp.pl/morfeusz/)
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
Copyright (c) 2006 Dawid Weiss
|
||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Morfologik nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,2 @@
|
|||
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||
(http://morfologik.blogspot.com/).
|
|
@ -0,0 +1,84 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||
|
||||
/**
|
||||
* {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
|
||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||
*/
|
||||
public class MorfologikAnalyzer extends ReusableAnalyzerBase {
|
||||
|
||||
private final DICTIONARY dictionary;
|
||||
private final Version version;
|
||||
|
||||
/**
|
||||
* Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
|
||||
*
|
||||
* @param vers
|
||||
* lucene compatibility version
|
||||
* @param dict
|
||||
* A constant specifying which dictionary to choose. See the
|
||||
* Morfologik documentation for details or use the default.
|
||||
*/
|
||||
public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
|
||||
this.version = vers;
|
||||
this.dictionary = dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer for an original MORFOLOGIK dictionary.
|
||||
*
|
||||
* @param vers lucene compatibility version
|
||||
*/
|
||||
public MorfologikAnalyzer(final Version vers) {
|
||||
this(vers, DICTIONARY.MORFOLOGIK);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @param field ignored field name
|
||||
* @param reader source of tokens
|
||||
*
|
||||
* @return A
|
||||
* {@link ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from an {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter} and {@link MorfologikFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(final String field, final Reader reader) {
|
||||
final Tokenizer src = new StandardTokenizer(this.version, reader);
|
||||
|
||||
return new TokenStreamComponents(
|
||||
src,
|
||||
new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import morfologik.stemming.*;
|
||||
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link TokenFilter} using Morfologik library.
|
||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||
*
|
||||
* MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
|
||||
* annotations for produced lemmas. See the Morfologik documentation for details.
|
||||
*/
|
||||
public class MorfologikFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
private final CharsRef scratch = new CharsRef(0);
|
||||
private final CharacterUtils charUtils;
|
||||
|
||||
private State current;
|
||||
private final TokenStream input;
|
||||
private final IStemmer stemmer;
|
||||
|
||||
private List<WordData> lemmaList;
|
||||
private int lemmaListIndex;
|
||||
|
||||
/**
|
||||
* Builds a filter for given PolishStemmer.DICTIONARY enum.
|
||||
*
|
||||
* @param in input token stream
|
||||
* @param dict PolishStemmer.DICTIONARY enum
|
||||
* @param version Lucene version compatibility for lowercasing.
|
||||
*/
|
||||
public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
|
||||
super(in);
|
||||
this.input = in;
|
||||
this.stemmer = new PolishStemmer(dict);
|
||||
this.charUtils = CharacterUtils.getInstance(version);
|
||||
this.lemmaList = Collections.emptyList();
|
||||
}
|
||||
|
||||
private void popNextLemma() {
|
||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||
termAtt.setEmpty().append(lemma.getStem());
|
||||
tagAtt.setTag(lemma.getTag());
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup a given surface form of a token and update
|
||||
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
|
||||
*/
|
||||
private boolean lookupSurfaceForm(CharSequence token) {
|
||||
lemmaList = this.stemmer.lookup(token);
|
||||
lemmaListIndex = 0;
|
||||
return lemmaList.size() > 0;
|
||||
}
|
||||
|
||||
/** Retrieves the next token (possibly from the list of lemmas). */
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (lemmaListIndex < lemmaList.size()) {
|
||||
restoreState(current);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
popNextLemma();
|
||||
return true;
|
||||
} else if (this.input.incrementToken()) {
|
||||
if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
|
||||
current = captureState();
|
||||
popNextLemma();
|
||||
} else {
|
||||
tagAtt.clear();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert to lowercase in-place.
|
||||
*/
|
||||
private CharSequence toLowercase(CharSequence chs) {
|
||||
final int length = scratch.length = chs.length();
|
||||
scratch.grow(length);
|
||||
|
||||
char buffer[] = scratch.chars;
|
||||
for (int i = 0; i < length;) {
|
||||
i += Character.toChars(
|
||||
Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
|
||||
}
|
||||
|
||||
return scratch;
|
||||
}
|
||||
|
||||
/** Resets stems accumulator and hands over to superclass. */
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
lemmaListIndex = 0;
|
||||
lemmaList = Collections.emptyList();
|
||||
super.reset();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* Morfologik dictionaries provide morphosyntactic annotations for
|
||||
* surface forms. For the exact format and description of these,
|
||||
* see the project's documentation (annotations vary by dictionary!).
|
||||
*/
|
||||
public interface MorphosyntacticTagAttribute extends Attribute {
|
||||
/**
|
||||
* Set the POS tag. The default value (no-value) is null.
|
||||
* @param pos POS tag corresponding to current lemma
|
||||
*/
|
||||
public void setTag(CharSequence pos);
|
||||
|
||||
/** Returns the POS tag of the term. */
|
||||
public CharSequence getTag();
|
||||
|
||||
/** Clear to default value. */
|
||||
public void clear();
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/**
|
||||
* @see MorphosyntacticTagAttribute
|
||||
*/
|
||||
public class MorphosyntacticTagAttributeImpl extends AttributeImpl
|
||||
implements MorphosyntacticTagAttribute, Cloneable {
|
||||
|
||||
/**
|
||||
* Either the original tag from WordData or a clone.
|
||||
*/
|
||||
private CharSequence tag;
|
||||
|
||||
/**
|
||||
* Set the tag.
|
||||
*/
|
||||
public void setTag(CharSequence pos) {
|
||||
this.tag = ((pos == null || pos.length() == 0) ? null : pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the POS tag of the term. If you need a copy of this char sequence, clone it
|
||||
* because it may change with each new term!
|
||||
*/
|
||||
public CharSequence getTag() {
|
||||
return tag;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
tag = null;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other instanceof MorphosyntacticTagAttribute) {
|
||||
return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if two char sequences are the same.
|
||||
*/
|
||||
private boolean equal(CharSequence chs1, CharSequence chs2) {
|
||||
if (chs1 == null && chs2 == null)
|
||||
return true;
|
||||
if (chs1 == null || chs2 == null)
|
||||
return false;
|
||||
int l1 = chs1.length();
|
||||
int l2 = chs2.length();
|
||||
if (l1 != l2)
|
||||
return false;
|
||||
for (int i = 0; i < l1; i++)
|
||||
if (chs1.charAt(i) != chs2.charAt(i))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.tag == null ? 0 : tag.hashCode();
|
||||
}
|
||||
|
||||
public void copyTo(AttributeImpl target) {
|
||||
((MorphosyntacticTagAttribute) target).setTag(this.tag);
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
|
||||
cloned.tag = (tag == null ? null : tag.toString());
|
||||
return cloned;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
This package provides a dictionary-driven lemmatization ("accurate stemming")
|
||||
filter and analyzer for the Polish language, driven by the
|
||||
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
|
||||
by Dawid Weiss and Marcin Miłkowski.
|
||||
</p>
|
||||
<p>
|
||||
The MorfologikFilter yields one or more terms for each token. Each
|
||||
of those terms is given the same position in the index.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,34 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
This package provides a dictionary-driven lemmatization ("accurate stemming")
|
||||
filter and analyzer for the Polish language, driven by the
|
||||
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
|
||||
by Dawid Weiss and Marcin Miłkowski.
|
||||
</p>
|
||||
<p>
|
||||
The MorfologikFilter yields one or more terms for each token. Each
|
||||
of those terms is given the same position in the index.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,105 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
|
||||
*/
|
||||
public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
private Analyzer getTestAnalyzer() {
|
||||
return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
|
||||
}
|
||||
|
||||
/** Test stemming of single tokens with Morfologik library. */
|
||||
public final void testSingleTokens() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
assertAnalyzesToReuse(a, "a", new String[] { "a" });
|
||||
assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
|
||||
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
|
||||
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
|
||||
}
|
||||
|
||||
/** Test stemming of multiple tokens and proper term metrics. */
|
||||
public final void testMultipleTokens() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
assertAnalyzesToReuse(
|
||||
a,
|
||||
"liście danych",
|
||||
new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
|
||||
new int[] { 0, 0, 0, 7, 7, 7 },
|
||||
new int[] { 6, 6, 6, 13, 13, 13 },
|
||||
new int[] { 1, 0, 0, 1, 0, 0 });
|
||||
}
|
||||
|
||||
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||
public final void testLeftoverStems() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
|
||||
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||
ts_1.reset();
|
||||
ts_1.incrementToken();
|
||||
assertEquals("first stream", "liść", termAtt_1.toString());
|
||||
|
||||
TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
|
||||
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||
ts_2.reset();
|
||||
ts_2.incrementToken();
|
||||
assertEquals("second stream", "dany", termAtt_2.toString());
|
||||
}
|
||||
|
||||
/** Test stemming of mixed-case tokens. */
|
||||
public final void testCase() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
|
||||
assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
|
||||
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
|
||||
assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
|
||||
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
|
||||
}
|
||||
|
||||
private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
|
||||
ts.incrementToken();
|
||||
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
|
||||
assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
|
||||
}
|
||||
|
||||
/** Test morphosyntactic annotations. */
|
||||
public final void testPOSAttribute() throws IOException {
|
||||
TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
|
||||
|
||||
assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
|
||||
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
|
||||
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue