From 29b09032d35ffa217c896cc639d9eb51b6cf5b02 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Thu, 30 Jun 2011 19:12:54 +0000 Subject: [PATCH] LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141671 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 5 + modules/analysis/CHANGES.txt | 6 +- modules/analysis/LICENSE.txt | 77 ++++++++++ modules/analysis/NOTICE.txt | 9 ++ modules/analysis/README.txt | 5 + modules/analysis/build.xml | 15 +- modules/analysis/morfologik/build.xml | 61 ++++++++ .../morfologik/lib/morfologik-fsa-1.5.2.jar | 2 + .../lib/morfologik-fsa-LICENSE-BSD.txt | 29 ++++ .../morfologik/lib/morfologik-fsa-NOTICE.txt | 2 + .../lib/morfologik-polish-1.5.2.jar | 2 + .../morfologik-polish-LICENSE-CCSA-BSD.txt | 41 ++++++ .../lib/morfologik-polish-NOTICE.txt | 8 ++ .../lib/morfologik-stemming-1.5.2.jar | 2 + .../lib/morfologik-stemming-LICENSE-BSD.txt | 29 ++++ .../lib/morfologik-stemming-NOTICE.txt | 2 + .../morfologik/MorfologikAnalyzer.java | 84 +++++++++++ .../analysis/morfologik/MorfologikFilter.java | 134 ++++++++++++++++++ .../MorphosyntacticTagAttribute.java | 40 ++++++ .../MorphosyntacticTagAttributeImpl.java | 91 ++++++++++++ .../lucene/analysis/morfologik/package.html | 34 +++++ .../morfologik/src/java/overview.html | 34 +++++ .../morfologik/TestMorfologikAnalyzer.java | 105 ++++++++++++++ 23 files changed, 815 insertions(+), 2 deletions(-) create mode 100644 modules/analysis/morfologik/build.xml create mode 100644 modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar create mode 100644 modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt create mode 100644 modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt create mode 100644 modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar create mode 100644 modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt create mode 
100644 modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt create mode 100644 modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar create mode 100644 modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt create mode 100644 modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt create mode 100644 modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java create mode 100644 modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java create mode 100644 modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java create mode 100644 modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java create mode 100644 modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html create mode 100644 modules/analysis/morfologik/src/java/overview.html create mode 100644 modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index 594167ae471..a16aab7f1f2 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -38,6 +38,8 @@ + + @@ -83,6 +85,9 @@ + + + diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index 75698330ccb..b7716d487cf 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -33,7 +33,11 @@ API Changes in half. (Robert Muir) New Features - + + * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer + (accurate stemmer) for Polish (includes morphosyntactic annotations). + (Michał Dybizbański, Dawid Weiss) + * LUCENE-2413: Consolidated Solr analysis components into common. 
New features from Solr now available to Lucene users include: - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms diff --git a/modules/analysis/LICENSE.txt b/modules/analysis/LICENSE.txt index 255d25119c8..e7b9831af34 100644 --- a/modules/analysis/LICENSE.txt +++ b/modules/analysis/LICENSE.txt @@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/modules/analysis/NOTICE.txt b/modules/analysis/NOTICE.txt index aa5585558e7..61d1022ff7a 100644 --- a/modules/analysis/NOTICE.txt +++ b/modules/analysis/NOTICE.txt @@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) is derived from Unicode data such as the Unicode Character Database. See http://unicode.org/copyright.html for more details. +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). + +Morfologik uses data from Polish ispell/myspell dictionary +(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) +LGPL and Creative Commons ShareAlike. + +Morfologik includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) diff --git a/modules/analysis/README.txt b/modules/analysis/README.txt index 95c73c31370..2642268a8c8 100644 --- a/modules/analysis/README.txt +++ b/modules/analysis/README.txt @@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar An add-on analysis library that contains a universal algorithmic stemmer, including tables for the Polish language. +lucene-analyzers-morfologik-XX.jar + An analyzer using the Morfologik stemming library. + common/src/java icu/src/java phonetic/src/java smartcn/src/java stempel/src/java +morfologik/src/java The source code for the ffve libraries. 
common/src/test @@ -47,4 +51,5 @@ icu/src/test phonetic/src/test smartcn/src/test stempel/src/test +morfologik/src/test Unit tests for the five libraries. diff --git a/modules/analysis/build.xml b/modules/analysis/build.xml index 4c299f0f7e1..5292da2b959 100644 --- a/modules/analysis/build.xml +++ b/modules/analysis/build.xml @@ -25,6 +25,7 @@ - icu: Analyzers that use functionality from ICU - smartcn: Smart Analyzer for Simplified Chinese Text - stempel: Algorithmic Stemmer for Polish + - morfologik: Morfologik Stemmer @@ -47,8 +48,12 @@ + + + + - + @@ -56,6 +61,7 @@ + @@ -63,6 +69,7 @@ + @@ -70,6 +77,7 @@ + @@ -77,6 +85,7 @@ + @@ -84,6 +93,7 @@ + @@ -94,6 +104,7 @@ + @@ -102,6 +113,7 @@ + @@ -110,6 +122,7 @@ + diff --git a/modules/analysis/morfologik/build.xml b/modules/analysis/morfologik/build.xml new file mode 100644 index 00000000000..7767d67f004 --- /dev/null +++ b/modules/analysis/morfologik/build.xml @@ -0,0 +1,61 @@ + + + + + + + + Morfologik Analyzer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Morfologik building dependency ${analyzers-common.jar} + + + diff --git a/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar b/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar new file mode 100644 index 00000000000..1f6840dad48 --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt b/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt new file mode 100644 index 00000000000..2684a835b73 --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt @@ -0,0 +1,29 @@ + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt b/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt new file mode 100644 index 00000000000..18ba2f3e39c --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt @@ -0,0 +1,2 @@ +This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski +(http://morfologik.blogspot.com/). 
diff --git a/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar b/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar new file mode 100644 index 00000000000..ca88e24004b --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt b/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt new file mode 100644 index 00000000000..9cf51c25a96 --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt @@ -0,0 +1,41 @@ +morfologik-polish, TERMS OF LICENCE + +This JAR contains and makes use of data from Polish ispell/myspell +dictionaries hosted at http://www.sjp.pl/slownik/en/ and is +licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses. + +Part-of-speech tags were added in Morfologik project and are not found +in the data from sjp.pl. + + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. 
+ +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt b/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt new file mode 100644 index 00000000000..6667b14220c --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt @@ -0,0 +1,8 @@ +This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski +(http://morfologik.blogspot.com/). + +This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/) +licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike. + +This product includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) diff --git a/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar b/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar new file mode 100644 index 00000000000..5f99ef5d26d --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history. +Apache SVN contains full history. 
\ No newline at end of file diff --git a/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt b/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt new file mode 100644 index 00000000000..2684a835b73 --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt @@ -0,0 +1,29 @@ + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt b/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt new file mode 100644 index 00000000000..18ba2f3e39c --- /dev/null +++ b/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt @@ -0,0 +1,2 @@ +This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski +(http://morfologik.blogspot.com/). diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java new file mode 100644 index 00000000000..74d0cf65679 --- /dev/null +++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java @@ -0,0 +1,84 @@ +// -*- c-basic-offset: 2 -*- +package org.apache.lucene.analysis.morfologik; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.Version; + +import morfologik.stemming.PolishStemmer.DICTIONARY; + +/** + * {@link org.apache.lucene.analysis.Analyzer} using Morfologik library. + * @see Morfologik project page + */ +public class MorfologikAnalyzer extends ReusableAnalyzerBase { + + private final DICTIONARY dictionary; + private final Version version; + + /** + * Builds an analyzer for a given PolishStemmer.DICTIONARY enum. + * + * @param vers + * lucene compatibility version + * @param dict + * A constant specifying which dictionary to choose. See the + * Morfologik documentation for details or use the default. + */ + public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) { + this.version = vers; + this.dictionary = dict; + } + + /** + * Builds an analyzer for an original MORFOLOGIK dictionary. + * + * @param vers lucene compatibility version + */ + public MorfologikAnalyzer(final Version vers) { + this(vers, DICTIONARY.MORFOLOGIK); + } + + /** + * Creates a + * {@link ReusableAnalyzerBase.TokenStreamComponents} + * which tokenizes all the text in the provided {@link Reader}. + * + * @param field ignored field name + * @param reader source of tokens + * + * @return A + * {@link ReusableAnalyzerBase.TokenStreamComponents} + * built from an {@link StandardTokenizer} filtered with + * {@link StandardFilter} and {@link MorfologikFilter}. 
+ */ + @Override + protected TokenStreamComponents createComponents(final String field, final Reader reader) { + final Tokenizer src = new StandardTokenizer(this.version, reader); + + return new TokenStreamComponents( + src, + new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version)); + } +} diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java new file mode 100644 index 00000000000..64780e507b8 --- /dev/null +++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java @@ -0,0 +1,134 @@ +// -*- c-basic-offset: 2 -*- +package org.apache.lucene.analysis.morfologik; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +import morfologik.stemming.*; +import morfologik.stemming.PolishStemmer.DICTIONARY; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.CharacterUtils; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.Version; + +/** + * {@link TokenFilter} using Morfologik library. + * @see Morfologik project page + * + * MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic + * annotations for produced lemmas. See the Morfologik documentation for details. + */ +public class MorfologikFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + private final CharsRef scratch = new CharsRef(0); + private final CharacterUtils charUtils; + + private State current; + private final TokenStream input; + private final IStemmer stemmer; + + private List lemmaList; + private int lemmaListIndex; + + /** + * Builds a filter for given PolishStemmer.DICTIONARY enum. + * + * @param in input token stream + * @param dict PolishStemmer.DICTIONARY enum + * @param version Lucene version compatibility for lowercasing. 
+ */ + public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) { + super(in); + this.input = in; + this.stemmer = new PolishStemmer(dict); + this.charUtils = CharacterUtils.getInstance(version); + this.lemmaList = Collections.emptyList(); + } + + private void popNextLemma() { + final WordData lemma = lemmaList.get(lemmaListIndex++); + termAtt.setEmpty().append(lemma.getStem()); + tagAtt.setTag(lemma.getTag()); + } + + /** + * Lookup a given surface form of a token and update + * {@link #lemmaList} and {@link #lemmaListIndex} accordingly. + */ + private boolean lookupSurfaceForm(CharSequence token) { + lemmaList = this.stemmer.lookup(token); + lemmaListIndex = 0; + return lemmaList.size() > 0; + } + + /** Retrieves the next token (possibly from the list of lemmas). */ + @Override + public final boolean incrementToken() throws IOException { + if (lemmaListIndex < lemmaList.size()) { + restoreState(current); + posIncrAtt.setPositionIncrement(0); + popNextLemma(); + return true; + } else if (this.input.incrementToken()) { + if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) { + current = captureState(); + popNextLemma(); + } else { + tagAtt.clear(); + } + return true; + } else { + return false; + } + } + + /** + * Convert to lowercase in-place. + */ + private CharSequence toLowercase(CharSequence chs) { + final int length = scratch.length = chs.length(); + scratch.grow(length); + + char buffer[] = scratch.chars; + for (int i = 0; i < length;) { + i += Character.toChars( + Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i); + } + + return scratch; + } + + /** Resets stems accumulator and hands over to superclass. 
*/ + @Override + public void reset() throws IOException { + lemmaListIndex = 0; + lemmaList = Collections.emptyList(); + super.reset(); + } +} diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java new file mode 100644 index 00000000000..a1950eef43e --- /dev/null +++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java @@ -0,0 +1,40 @@ +// -*- c-basic-offset: 2 -*- +package org.apache.lucene.analysis.morfologik; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +/** + * Morfologik dictionaries provide morphosyntactic annotations for + * surface forms. For the exact format and description of these, + * see the project's documentation (annotations vary by dictionary!). + */ +public interface MorphosyntacticTagAttribute extends Attribute { + /** + * Set the POS tag. The default value (no-value) is null. + * @param pos POS tag corresponding to current lemma + */ + public void setTag(CharSequence pos); + + /** Returns the POS tag of the term. 
*/ + public CharSequence getTag(); + + /** Clear to default value. */ + public void clear(); +} diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java new file mode 100644 index 00000000000..11ff3d5fd36 --- /dev/null +++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java @@ -0,0 +1,91 @@ +// -*- c-basic-offset: 2 -*- +package org.apache.lucene.analysis.morfologik; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.AttributeImpl; + +/** + * @see MorphosyntacticTagAttribute + */ +public class MorphosyntacticTagAttributeImpl extends AttributeImpl + implements MorphosyntacticTagAttribute, Cloneable { + + /** + * Either the original tag from WordData or a clone. + */ + private CharSequence tag; + + /** + * Set the tag. + */ + public void setTag(CharSequence pos) { + this.tag = ((pos == null || pos.length() == 0) ? null : pos); + } + + /** + * Returns the POS tag of the term. 
If you need a copy of this char sequence, clone it + * because it may change with each new term! + */ + public CharSequence getTag() { + return tag; + } + + public void clear() { + tag = null; + } + + public boolean equals(Object other) { + if (other instanceof MorphosyntacticTagAttribute) { + return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag()); + } + return false; + } + + /** + * Check if two char sequences are the same. + */ + private boolean equal(CharSequence chs1, CharSequence chs2) { + if (chs1 == null && chs2 == null) + return true; + if (chs1 == null || chs2 == null) + return false; + int l1 = chs1.length(); + int l2 = chs2.length(); + if (l1 != l2) + return false; + for (int i = 0; i < l1; i++) + if (chs1.charAt(i) != chs2.charAt(i)) + return false; + return true; + } + + public int hashCode() { + return this.tag == null ? 0 : tag.hashCode(); + } + + public void copyTo(AttributeImpl target) { + ((MorphosyntacticTagAttribute) target).setTag(this.tag); + } + + public Object clone() { + MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl(); + cloned.tag = (tag == null ? null : tag.toString()); + return cloned; + } +} diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html new file mode 100644 index 00000000000..6b67d0e4c34 --- /dev/null +++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html @@ -0,0 +1,34 @@ + + + + + + + +

+ This package provides a dictionary-driven lemmatization ("accurate stemming") + filter and analyzer for the Polish language, driven by the + Morfologik library developed + by Dawid Weiss and Marcin Miłkowski. +

+

+ The MorfologikFilter yields one or more terms for each token. Each + of those terms is given the same position in the index. +

+ + diff --git a/modules/analysis/morfologik/src/java/overview.html b/modules/analysis/morfologik/src/java/overview.html new file mode 100644 index 00000000000..6b67d0e4c34 --- /dev/null +++ b/modules/analysis/morfologik/src/java/overview.html @@ -0,0 +1,34 @@ + + + + + + + +

+ This package provides a dictionary-driven lemmatization ("accurate stemming") + filter and analyzer for the Polish language, driven by the + Morfologik library developed + by Dawid Weiss and Marcin Miłkowski. +

+

+ The MorfologikFilter yields one or more terms for each token. Each + of those terms is given the same position in the index. +

+ + diff --git a/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java new file mode 100644 index 00000000000..42154ba08ca --- /dev/null +++ b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java @@ -0,0 +1,118 @@ +// -*- c-basic-offset: 2 -*- +package org.apache.lucene.analysis.morfologik; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +
+/**
+ * Tests for {@link MorfologikAnalyzer}: lemmas, offsets, position increments
+ * and morphosyntactic tags produced for Polish input.
+ *
+ * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
+ */
+public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
+
+  /** Creates a fresh analyzer for the current test version. */
+  private Analyzer getTestAnalyzer() {
+    return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Test stemming of single tokens with Morfologik library. */
+  public final void testSingleTokens() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    assertAnalyzesToReuse(a, "a", new String[] { "a" });
+    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+    assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
+  }
+
+  /** Test stemming of multiple tokens and proper term metrics. */
+  public final void testMultipleTokens() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    assertAnalyzesToReuse(
+      a,
+      "liście danych",
+      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
+      new int[] { 0, 0, 0, 7, 7, 7 },      // start offsets
+      new int[] { 6, 6, 6, 13, 13, 13 },   // end offsets
+      new int[] { 1, 0, 0, 1, 0, 0 });     // position increments: lemmas of one token are stacked
+  }
+
+  /**
+   * Test reuse of MorfologikFilter with leftover stems: the first stream is
+   * abandoned after only the first lemma of "liście" has been read, and the second
+   * stream, obtained from the same analyzer, must start with its own first lemma.
+   */
+  public final void testLeftoverStems() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
+    CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
+    ts_1.reset();
+    assertTrue("first stream has a token", ts_1.incrementToken());
+    assertEquals("first stream", "liść", termAtt_1.toString());
+
+    TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
+    CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
+    ts_2.reset();
+    assertTrue("second stream has a token", ts_2.incrementToken());
+    assertEquals("second stream", "dany", termAtt_2.toString());
+  }
+
+  /** Test stemming of mixed-case tokens. */
+  public final void testCase() throws IOException {
+    Analyzer a = getTestAnalyzer();
+
+    assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
+    assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
+
+    assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
+    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+
+    assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
+    assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
+
+    assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
+  }
+
+  /**
+   * Advances <code>ts</code> by one token and checks its term text and tag.
+   */
+  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+    assertTrue("expected another token: " + term, ts.incrementToken());
+    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
+    assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+  }
+
+  /** Test morphosyntactic annotations. */
+  public final void testPOSAttribute() throws IOException {
+    TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
+    // Reset before consuming, as testLeftoverStems does.
+    ts.reset();
+
+    assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
+    assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
+    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+  }
+}