LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141671 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2011-06-30 19:12:54 +00:00
parent cec86dbc06
commit 29b09032d3
23 changed files with 815 additions and 2 deletions

View File

@ -38,6 +38,8 @@
<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
<classpathentry kind="src" path="modules/analysis/morfologik/src/java"/>
<classpathentry kind="src" path="modules/analysis/morfologik/src/test"/>
<classpathentry kind="src" path="modules/benchmark/src/java"/>
<classpathentry kind="src" path="modules/benchmark/src/test"/>
<classpathentry kind="src" path="modules/common/src/java"/>
@ -83,6 +85,9 @@
<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>

View File

@ -33,7 +33,11 @@ API Changes
in half. (Robert Muir)
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
(accurate stemmer) for Polish (includes morphosyntactic annotations).
(Michał Dybizbański, Dawid Weiss)
* LUCENE-2413: Consolidated Solr analysis components into common.
New features from Solr now available to Lucene users include:
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms

View File

@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
The following license applies to the Morfologik project:
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---
The dictionary comes from Morfologik project. Morfologik uses data from
Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
is licenced on the terms of (inter alia) LGPL and Creative Commons
ShareAlike. The part-of-speech tags were added in Morfologik project and
are not found in the data from sjp.pl. The tagset is similar to IPI PAN
tagset.
---
The following license applies to the Morfeusz project,
used by org.apache.lucene.analysis.morfologik.
BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
Marcin Woliński, Robert Wołosz
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.
The Morfologik analyzer (morfologik) includes BSD-licensed software
developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
Morfologik uses data from Polish ispell/myspell dictionary
(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
LGPL and Creative Commons ShareAlike.
Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)

View File

@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar
An add-on analysis library that contains a universal algorithmic stemmer,
including tables for the Polish language.
lucene-analyzers-morfologik-XX.jar
An analyzer using the Morfologik stemming library.
common/src/java
icu/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
morfologik/src/java
The source code for the five libraries.
common/src/test
@ -47,4 +51,5 @@ icu/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
morfologik/src/test
Unit tests for the five libraries.

View File

@ -25,6 +25,7 @@
- icu: Analyzers that use functionality from ICU
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
- morfologik: Morfologik Stemmer
</description>
<target name="common">
@ -47,8 +48,12 @@
<ant dir="stempel" />
</target>
<target name="morfologik">
<ant dir="morfologik" />
</target>
<target name="default" depends="compile"/>
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
<target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
<target name="clean">
<ant dir="common" target="clean" />
@ -56,6 +61,7 @@
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
<ant dir="morfologik" target="clean" />
</target>
<target name="validate">
<ant dir="common" target="validate" />
@ -63,6 +69,7 @@
<ant dir="phonetic" target="validate" />
<ant dir="smartcn" target="validate" />
<ant dir="stempel" target="validate" />
<ant dir="morfologik" target="validate" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
@ -70,6 +77,7 @@
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
<ant dir="morfologik" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
@ -77,6 +85,7 @@
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
<ant dir="morfologik" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
@ -84,6 +93,7 @@
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
<ant dir="morfologik" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@ -94,6 +104,7 @@
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
<ant dir="morfologik" target="dist-maven" />
</target>
<target name="javadocs">
@ -102,6 +113,7 @@
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
<ant dir="morfologik" target="javadocs" />
</target>
<target name="javadocs-index.html">
@ -110,6 +122,7 @@
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
<ant dir="morfologik" target="javadocs-index.html" />
</target>
</project>

View File

@ -0,0 +1,61 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Ant build for the Morfologik analysis module. Most targets come from the
imported contrib-build.xml; this file only declares the module's output
directories, its extra library dependencies and the classpaths built on them.
-->
<project name="analyzers-morfologik" default="default">
<description>
Morfologik Analyzer
</description>
<!-- Module output directories, resolved relative to this build file. -->
<property name="build.dir" location="../build/morfologik" />
<property name="dist.dir" location="../dist/morfologik" />
<!-- The three Morfologik jars shipped in lib/; matched by wildcard so the version can change. -->
<path id="additional.dependencies">
<fileset dir="lib" includes="morfologik-fsa-*.jar"/>
<fileset dir="lib" includes="morfologik-polish-*.jar"/>
<fileset dir="lib" includes="morfologik-stemming-*.jar"/>
</path>
<!-- Exposes the jars above as the project.classpath property; presumably read by the imported build, confirm there. -->
<pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies" />
<import file="../../../lucene/contrib/contrib-build.xml"/>
<!--
module-uptodate is a macro defined outside this file. NOTE(review): it appears to set
analyzers-common.uptodate and analyzers-common.jar for the common analyzers jar; verify
against the macro definition.
-->
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
<!-- Compile classpath: the common analyzers jar followed by the base classpath from the imported build. -->
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
<!-- Test classpath: compile classpath plus Lucene's test classes, JUnit and this module's compiled classes. -->
<path id="test.classpath">
<path refid="classpath"/>
<pathelement location="../../../lucene/build/classes/test-framework/"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
</path>
<!-- Runs build-analyzers-common before delegating to the shared compile-core target. -->
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
<!-- Runs the default target of ../common/build.xml; skipped when analyzers-common.uptodate is set. -->
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
<echo>Morfologik building dependency ${analyzers-common.jar}</echo>
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@ -0,0 +1,2 @@
AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,41 @@
morfologik-polish, TERMS OF LICENCE
This JAR contains and makes use of data from Polish ispell/myspell
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses.
Part-of-speech tags were added in Morfologik project and are not found
in the data from sjp.pl.
BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
Marcin Woliński, Robert Wołosz
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,8 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).
This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/)
licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike.
This product includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)

View File

@ -0,0 +1,2 @@
AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@ -0,0 +1,84 @@
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import morfologik.stemming.PolishStemmer.DICTIONARY;
/**
 * An {@link org.apache.lucene.analysis.Analyzer} that tokenizes text with
 * {@link StandardTokenizer} and lemmatizes the tokens with {@link MorfologikFilter}.
 *
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
public class MorfologikAnalyzer extends ReusableAnalyzerBase {
  /** Lucene compatibility version passed to the tokenizer and both filters. */
  private final Version version;

  /** Morfologik dictionary passed to {@link MorfologikFilter}. */
  private final DICTIONARY dictionary;

  /**
   * Creates an analyzer backed by the default {@link DICTIONARY#MORFOLOGIK} dictionary.
   *
   * @param vers lucene compatibility version
   */
  public MorfologikAnalyzer(final Version vers) {
    this(vers, DICTIONARY.MORFOLOGIK);
  }

  /**
   * Creates an analyzer backed by the given PolishStemmer.DICTIONARY constant.
   *
   * @param vers
   *          lucene compatibility version
   * @param dict
   *          selects the dictionary to use; see the Morfologik documentation
   *          for details or use the single-argument constructor for the default.
   */
  public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
    this.dictionary = dict;
    this.version = vers;
  }

  /**
   * Builds the analysis chain for one field: a {@link StandardTokenizer} reading
   * from {@code reader}, wrapped in a {@link StandardFilter} and finally in a
   * {@link MorfologikFilter}.
   *
   * @param field field name; not used when building the chain
   * @param reader source of the text to tokenize
   *
   * @return the {@link ReusableAnalyzerBase.TokenStreamComponents} holding the
   *         tokenizer and the outermost filter of the chain
   */
  @Override
  protected TokenStreamComponents createComponents(final String field, final Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(this.version, reader);
    final StandardFilter normalized = new StandardFilter(this.version, tokenizer);
    final MorfologikFilter lemmatized =
        new MorfologikFilter(normalized, this.dictionary, this.version);
    return new TokenStreamComponents(tokenizer, lemmatized);
  }
}

View File

@ -0,0 +1,134 @@
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import morfologik.stemming.*;
import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
/**
 * {@link TokenFilter} that replaces each input token with its lemmas as found by
 * the Morfologik library. When a token has several lemmas, they are emitted one
 * after another; every lemma after the first gets a position increment of zero.
 * Tokens without any dictionary entry pass through unchanged.
 *
 * <p>MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides
 * morphosyntactic annotations for produced lemmas. See the Morfologik documentation
 * for details.
 *
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
public class MorfologikFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  /** Reusable buffer that receives the lower-cased copy of the current term. */
  private final CharsRef scratch = new CharsRef(0);

  private final CharacterUtils charUtils;

  /** Attribute state captured for the current surface form; restored before each extra lemma. */
  private State current;

  /** The wrapped stream, as handed to the constructor. */
  private final TokenStream input;

  private final IStemmer stemmer;

  /** Lemmas found for the current surface form; empty when there is nothing pending. */
  private List<WordData> lemmaList;

  /** Index of the next entry of {@link #lemmaList} to emit. */
  private int lemmaListIndex;

  /**
   * Creates a filter that looks tokens up in the given PolishStemmer.DICTIONARY.
   *
   * @param in input token stream
   * @param dict PolishStemmer.DICTIONARY enum
   * @param version Lucene version compatibility for lowercasing.
   */
  public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
    super(in);
    this.input = in;
    this.stemmer = new PolishStemmer(dict);
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  }

  /** Writes the next pending lemma and its tag into the attributes and advances the index. */
  private void popNextLemma() {
    final WordData next = lemmaList.get(lemmaListIndex);
    lemmaListIndex++;
    termAtt.setEmpty().append(next.getStem());
    tagAtt.setTag(next.getTag());
  }

  /**
   * Looks up the given surface form and replaces {@link #lemmaList} with the
   * result, rewinding {@link #lemmaListIndex} to the first entry.
   *
   * @return <code>true</code> if at least one lemma was found
   */
  private boolean lookupSurfaceForm(CharSequence token) {
    lemmaList = this.stemmer.lookup(token);
    lemmaListIndex = 0;
    return !lemmaList.isEmpty();
  }

  /**
   * Advances to the next output token: either a pending lemma of the previous
   * surface form or, once those are exhausted, the next token of the input.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // Pending lemmas come first; they share the position of the first lemma.
    if (lemmaListIndex < lemmaList.size()) {
      restoreState(current);
      posIncrAtt.setPositionIncrement(0);
      popNextLemma();
      return true;
    }

    if (!this.input.incrementToken()) {
      return false;
    }

    // Try the token as written first and fall back to its lower-cased form.
    final boolean found =
        lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt));
    if (found) {
      current = captureState();
      popNextLemma();
    } else {
      // Unknown word: keep the term as is, but make sure no stale tag remains.
      tagAtt.clear();
    }
    return true;
  }

  /**
   * Lower-cases the given sequence code point by code point into the shared
   * {@link #scratch} buffer; the argument itself is left untouched.
   *
   * @return the scratch buffer, valid until the next call
   */
  private CharSequence toLowercase(CharSequence chs) {
    final int length = chs.length();
    scratch.length = length;
    scratch.grow(length);

    final char[] buffer = scratch.chars;
    int pos = 0;
    while (pos < length) {
      final int lowered = Character.toLowerCase(charUtils.codePointAt(chs, pos));
      pos += Character.toChars(lowered, buffer, pos);
    }
    return scratch;
  }

  /** Drops any lemmas still pending from a previous use, then resets the superclass. */
  @Override
  public void reset() throws IOException {
    lemmaListIndex = 0;
    lemmaList = Collections.emptyList();
    super.reset();
  }
}

View File

@ -0,0 +1,40 @@
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/**
 * Carries the morphosyntactic annotation that a Morfologik dictionary
 * associates with a surface form. The exact format and meaning of the
 * annotations is described in the project's documentation (annotations
 * vary by dictionary!).
 */
public interface MorphosyntacticTagAttribute extends Attribute {
  /**
   * Assigns the POS tag. The default value (no-value) is null.
   *
   * @param pos POS tag corresponding to current lemma
   */
  void setTag(CharSequence pos);

  /**
   * Gives access to the POS tag of the term.
   *
   * @return the POS tag of the term
   */
  CharSequence getTag();

  /** Resets the tag to its default value. */
  void clear();
}

View File

@ -0,0 +1,91 @@
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.AttributeImpl;
/**
 * Default implementation of {@link MorphosyntacticTagAttribute}, holding the tag
 * as a single {@link CharSequence}.
 *
 * @see MorphosyntacticTagAttribute
 */
public class MorphosyntacticTagAttributeImpl extends AttributeImpl
    implements MorphosyntacticTagAttribute, Cloneable {

  /**
   * Either the original tag from WordData or a clone.
   */
  private CharSequence tag;

  /**
   * Set the tag. A <code>null</code> or empty sequence is stored as <code>null</code>
   * (the "no tag" value).
   *
   * @param pos the tag to store; the reference is kept, not copied
   */
  public void setTag(CharSequence pos) {
    this.tag = ((pos == null || pos.length() == 0) ? null : pos);
  }

  /**
   * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
   * because it may change with each new term!
   *
   * @return the current tag, or <code>null</code> if none is set
   */
  public CharSequence getTag() {
    return tag;
  }

  /** Resets the tag to <code>null</code>. */
  public void clear() {
    tag = null;
  }

  /**
   * Two tag attributes are equal when their tags have the same character
   * content (or are both <code>null</code>), whatever the concrete
   * {@link CharSequence} type.
   */
  public boolean equals(Object other) {
    if (other instanceof MorphosyntacticTagAttribute) {
      return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
    }
    return false;
  }

  /**
   * Check if two char sequences are the same, character by character.
   */
  private boolean equal(CharSequence chs1, CharSequence chs2) {
    if (chs1 == null && chs2 == null)
      return true;
    if (chs1 == null || chs2 == null)
      return false;
    int l1 = chs1.length();
    int l2 = chs2.length();
    if (l1 != l2)
      return false;
    for (int i = 0; i < l1; i++)
      if (chs1.charAt(i) != chs2.charAt(i))
        return false;
    return true;
  }

  /**
   * Hash of the tag's character content, 0 for no tag.
   *
   * <p>The hash is computed from the characters rather than by calling
   * <code>tag.hashCode()</code>: {@link #equals(Object)} compares content, and
   * non-String sequences use identity hashes, so delegating would give equal
   * attributes different hash codes. The formula is the one used by
   * {@link String#hashCode()}, so String tags hash as before.
   */
  public int hashCode() {
    if (tag == null) {
      return 0;
    }
    int hash = 0;
    for (int i = 0, len = tag.length(); i < len; i++) {
      hash = 31 * hash + tag.charAt(i);
    }
    return hash;
  }

  /**
   * Copies the tag reference into the target attribute.
   *
   * @param target must implement {@link MorphosyntacticTagAttribute}
   */
  public void copyTo(AttributeImpl target) {
    ((MorphosyntacticTagAttribute) target).setTag(this.tag);
  }

  /**
   * Returns a new instance holding an immutable String snapshot of the tag,
   * so the copy does not change if the original sequence is modified later.
   */
  public Object clone() {
    MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
    cloned.tag = (tag == null ? null : tag.toString());
    return cloned;
  }
}

View File

@ -0,0 +1,34 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
</head>
<body>
<p>
This package provides dictionary-driven lemmatization ("accurate stemming")
filter and analyzer for the Polish Language, driven by the
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
by Dawid Weiss and Marcin Miłkowski.
</p>
<p>
The MorfologikFilter yields one or more terms for each token. Each
of those terms is given the same position in the index.
</p>
</body>
</html>

View File

@ -0,0 +1,34 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
</head>
<body>
<p>
This package provides dictionary-driven lemmatization ("accurate stemming")
filter and analyzer for the Polish Language, driven by the
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
by Dawid Weiss and Marcin Miłkowski.
</p>
<p>
The MorfologikFilter yields one or more terms for each token. Each
of those terms is given the same position in the index.
</p>
</body>
</html>

View File

@ -0,0 +1,105 @@
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Tests for {@link MorfologikAnalyzer} and the attributes produced by its filter.
 *
 * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
 */
public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {

  /** Creates the analyzer under test with the default dictionary. */
  private Analyzer getTestAnalyzer() {
    return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
  }

  /** Test stemming of single tokens with Morfologik library. */
  public final void testSingleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(a, "a", new String[] { "a" });
    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
    assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
  }

  /** Test stemming of multiple tokens and proper term metrics. */
  public final void testMultipleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(
      a,
      "liście danych",
      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 6, 6, 13, 13, 13 },
      new int[] { 1, 0, 0, 1, 0, 0 });
  }

  /** Test reuse of MorfologikFilter with leftover stems. */
  public final void testLeftoverStems() throws IOException {
    Analyzer a = getTestAnalyzer();
    TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
    CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
    ts_1.reset();
    ts_1.incrementToken();
    assertEquals("first stream", "liść", termAtt_1.toString());

    // The first stream is abandoned with lemmas still pending; reset() must discard them.
    TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
    CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
    ts_2.reset();
    ts_2.incrementToken();
    assertEquals("second stream", "dany", termAtt_2.toString());
  }

  /** Test stemming of mixed-case tokens. */
  public final void testCase() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
    assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
    assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
    assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
    assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
    assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
  }

  /**
   * Advances the stream by one token and checks its term text and POS tag.
   *
   * @param ts stream to advance; the caller must have called reset() on it
   * @param term expected term text
   * @param pos expected morphosyntactic tag
   */
  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
    ts.incrementToken();
    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
    assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
  }

  /** Test morphosyntactic annotations. */
  public final void testPOSAttribute() throws IOException {
    TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
    // The TokenStream consumer workflow requires reset() before the first incrementToken().
    ts.reset();
    assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
    assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
  }
}