LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141671 13f79535-47bb-0310-9956-ffa450edef68
2011-06-30 19:12:54 +00:00 · 2011-06-30 19:12:54 +00:00 · 29b09032d3
parent cec86dbc06
commit 29b09032d3
23 changed files with 815 additions and 2 deletions
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@ -38,6 +38,8 @@
 	<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
 	<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
 	<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
+	<classpathentry kind="src" path="modules/analysis/morfologik/src/java"/>
+	<classpathentry kind="src" path="modules/analysis/morfologik/src/test"/>	
 	<classpathentry kind="src" path="modules/benchmark/src/java"/>
 	<classpathentry kind="src" path="modules/benchmark/src/test"/>
 	<classpathentry kind="src" path="modules/common/src/java"/>
@ -83,6 +85,9 @@
 	<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
+	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
+	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
+	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@ -33,7 +33,11 @@ API Changes
   in half. (Robert Muir)
   
 New Features
-   
+
+ * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer 
+   (accurate stemmer) for Polish (includes morphosyntactic annotations).
+   (Michał Dybizbański, Dawid Weiss)
+
 * LUCENE-2413: Consolidated Solr analysis components into common. 
   New features from Solr now available to Lucene users include:
   - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
--- a/modules/analysis/LICENSE.txt
+++ b/modules/analysis/LICENSE.txt
@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.
+
+The following license applies to the Morfologik project:
+
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---
+
+The dictionary comes from Morfologik project. Morfologik uses data from 
+Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and 
+is licenced on the terms of (inter alia) LGPL and Creative Commons 
+ShareAlike. The part-of-speech tags were added in Morfologik project and
+are not found in the data from sjp.pl. The tagset is similar to IPI PAN
+tagset.
+
+---
+
+The following license applies to the Morfeusz project,
+used by org.apache.lucene.analysis.morfologik.
+
+BSD-licensed dictionary of Polish (SGJP)
+http://sgjp.pl/morfeusz/
+
+Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 
+	    	 Marcin Woliński, Robert Wołosz
+
+All rights reserved.
+
+Redistribution and  use in  source and binary  forms, with  or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
+LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
+SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
+WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/NOTICE.txt
+++ b/modules/analysis/NOTICE.txt
@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
 is derived from Unicode data such as the Unicode Character Database. 
 See http://unicode.org/copyright.html for more details.

+The Morfologik analyzer (morfologik) includes BSD-licensed software
+developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
+
+Morfologik uses data from Polish ispell/myspell dictionary
+(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
+LGPL and Creative Commons ShareAlike.
+
+Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
+(http://sgjp.pl/morfeusz/)
--- a/modules/analysis/README.txt
+++ b/modules/analysis/README.txt
@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar
  An add-on analysis library that contains a universal algorithmic stemmer,
  including tables for the Polish language.

+lucene-analyzers-morfologik-XX.jar
+  An analyzer using the Morfologik stemming library.
+
 common/src/java
 icu/src/java
 phonetic/src/java
 smartcn/src/java
 stempel/src/java
+morfologik/src/java
  The source code for the five libraries.

 common/src/test
@ -47,4 +51,5 @@ icu/src/test
 phonetic/src/test
 smartcn/src/test
 stempel/src/test
+morfologik/src/test
  Unit tests for the five libraries.
--- a/modules/analysis/build.xml
+++ b/modules/analysis/build.xml
@ -25,6 +25,7 @@
      - icu: Analyzers that use functionality from ICU
      - smartcn:	Smart Analyzer for Simplified Chinese Text
      - stempel:	Algorithmic Stemmer for Polish
+      - morfologik:	Morfologik Stemmer
  </description>

  <target name="common">
@ -47,8 +48,12 @@
    <ant dir="stempel" />
  </target>

+  <target name="morfologik">
+    <ant dir="morfologik" />
+  </target>
+
  <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
+  <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />

  <target name="clean">
    <ant dir="common" target="clean" />
@ -56,6 +61,7 @@
    <ant dir="phonetic" target="clean" />
    <ant dir="smartcn" target="clean" />
    <ant dir="stempel" target="clean" />
+    <ant dir="morfologik" target="clean" />
  </target>
  <target name="validate">
    <ant dir="common" target="validate" />
@ -63,6 +69,7 @@
    <ant dir="phonetic" target="validate" />
    <ant dir="smartcn" target="validate" />
    <ant dir="stempel" target="validate" />
+    <ant dir="morfologik" target="validate" />
  </target>
  <target name="compile-core">
    <ant dir="common" target="compile-core" />
@ -70,6 +77,7 @@
    <ant dir="phonetic" target="compile-core" />
    <ant dir="smartcn" target="compile-core" />
    <ant dir="stempel" target="compile-core" />
+    <ant dir="morfologik" target="compile-core" />
  </target>
  <target name="compile-test">
    <ant dir="common" target="compile-test" />
@ -77,6 +85,7 @@
    <ant dir="phonetic" target="compile-test" />
    <ant dir="smartcn" target="compile-test" />
    <ant dir="stempel" target="compile-test" />
+    <ant dir="morfologik" target="compile-test" />
  </target>
  <target name="test">
    <ant dir="common" target="test" />
@ -84,6 +93,7 @@
    <ant dir="phonetic" target="test" />
    <ant dir="smartcn" target="test" />
    <ant dir="stempel" target="test" />
+    <ant dir="morfologik" target="test" />
  </target>

  <target name="build-artifacts-and-tests" depends="default,compile-test" />
@ -94,6 +104,7 @@
    <ant dir="phonetic" target="dist-maven" />
    <ant dir="smartcn" target="dist-maven" />
    <ant dir="stempel" target="dist-maven" />
+    <ant dir="morfologik" target="dist-maven" />
  </target>  	

  <target name="javadocs">
@ -102,6 +113,7 @@
    <ant dir="phonetic" target="javadocs" />
    <ant dir="smartcn" target="javadocs" />
    <ant dir="stempel" target="javadocs" />
+    <ant dir="morfologik" target="javadocs" />
  </target>  	

  <target name="javadocs-index.html">
@ -110,6 +122,7 @@
    <ant dir="phonetic" target="javadocs-index.html" />
    <ant dir="smartcn" target="javadocs-index.html" />
    <ant dir="stempel" target="javadocs-index.html" />
+    <ant dir="morfologik" target="javadocs-index.html" />
  </target>
 	
 </project>
--- a/modules/analysis/morfologik/build.xml
+++ b/modules/analysis/morfologik/build.xml
@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="analyzers-morfologik" default="default">
+
+  <description>
+    Morfologik Analyzer
+  </description>
+	
+  <property name="build.dir" location="../build/morfologik" />
+  <property name="dist.dir" location="../dist/morfologik" />
+
+  <path id="additional.dependencies">
+    <fileset dir="lib" includes="morfologik-fsa-*.jar"/>
+    <fileset dir="lib" includes="morfologik-polish-*.jar"/>
+    <fileset dir="lib" includes="morfologik-stemming-*.jar"/>
+  </path>
+
+  <pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies" />
+
+  <import file="../../../lucene/contrib/contrib-build.xml"/>
+
+  <module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
+    property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <path id="test.classpath">
+    <path refid="classpath"/>
+    <pathelement location="../../../lucene/build/classes/test-framework/"/>
+    <pathelement location="../../../lucene/build/classes/test/"/>
+    <path refid="junit-path"/>
+    <pathelement location="${build.dir}/classes/java"/>
+  </path>
+
+  <target name="compile-core" depends="build-analyzers-common, common.compile-core" />
+
+  <target name="build-analyzers-common" unless="analyzers-common.uptodate">
+    <echo>Morfologik building dependency ${analyzers-common.jar}</echo>
+    <ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
+  </target>
+</project>
--- a/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar
@ -0,0 +1,2 @@
+AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.
+Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
@ -0,0 +1,29 @@
+
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt
@ -0,0 +1,2 @@
+This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
+(http://morfologik.blogspot.com/).
--- a/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar
@ -0,0 +1,2 @@
+AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.
+Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt
@ -0,0 +1,41 @@
+morfologik-polish, TERMS OF LICENCE
+
+This JAR contains and makes use of data from Polish ispell/myspell 
+dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
+licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses.
+
+Part-of-speech tags were added in Morfologik project and are not found 
+in the data from sjp.pl.
+
+
+BSD-licensed dictionary of Polish (SGJP)
+http://sgjp.pl/morfeusz/
+
+Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 
+	    	 Marcin Woliński, Robert Wołosz
+
+All rights reserved.
+
+Redistribution and  use in  source and binary  forms, with  or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
+LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
+SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
+WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
@ -0,0 +1,8 @@
+This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
+(http://morfologik.blogspot.com/).
+
+This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/)
+licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike.
+
+This product includes data from BSD-licensed dictionary of Polish (SGJP)
+(http://sgjp.pl/morfeusz/)
--- a/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar
@ -0,0 +1,2 @@
+AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.
+Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
@ -0,0 +1,29 @@
+
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt
@ -0,0 +1,2 @@
+This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
+(http://morfologik.blogspot.com/).
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
@ -0,0 +1,84 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.Version;
+
+import morfologik.stemming.PolishStemmer.DICTIONARY;
+
+/**
+ * {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
+ *
+ * <p>The analysis chain built by {@link #createComponents(String, Reader)} is
+ * {@link StandardTokenizer}, then {@link StandardFilter}, then
+ * {@link MorfologikFilter}. The same compatibility version and dictionary
+ * given at construction time are used for every chain that is created.
+ *
+ * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
+ */
+public class MorfologikAnalyzer extends ReusableAnalyzerBase {
+
+  private final DICTIONARY dictionary;
+  private final Version version;
+
+  /**
+   * Builds an analyzer for a given PolishStemmer.DICTIONARY enum.
+   *
+   * @param vers
+   *          lucene compatibility version; passed on to the tokenizer and
+   *          to both filters of the analysis chain
+   * @param dict
+   *          A constant specifying which dictionary to choose. See the
+   *          Morfologik documentation for details or use the default.
+   */
+  public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
+    this.version = vers;
+    this.dictionary = dict;
+  }
+
+  /**
+   * Builds an analyzer using the default dictionary,
+   * {@code DICTIONARY.MORFOLOGIK}.
+   *
+   * @param vers         lucene compatibility version
+   */
+  public MorfologikAnalyzer(final Version vers) {
+    this(vers, DICTIONARY.MORFOLOGIK);
+  }
+
+  /**
+   * Creates a
+   * {@link ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @param field field name; not used, every field gets the same chain
+   * @param reader source of tokens
+   *
+   * @return A
+   *         {@link ReusableAnalyzerBase.TokenStreamComponents}
+   *         whose source is a {@link StandardTokenizer} and whose result
+   *         stream is that tokenizer filtered with
+   *         {@link StandardFilter} and then {@link MorfologikFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(final String field, final Reader reader) {
+    final Tokenizer src = new StandardTokenizer(this.version, reader);
+    
+    return new TokenStreamComponents(
+      src,
+      new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
+  }
+}
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@ -0,0 +1,134 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import morfologik.stemming.*;
+import morfologik.stemming.PolishStemmer.DICTIONARY;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link TokenFilter} using Morfologik library.
+ *
+ * <p>MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
+ * annotations for produced lemmas. See the Morfologik documentation for details.
+ *
+ * <p>Every input token is looked up in the dictionary, first exactly as it
+ * arrives and, if that yields nothing, in lower-cased form. When lemmas are
+ * found, the term is replaced by the first lemma and its tag is set; each
+ * further lemma is emitted as an additional token with a position increment
+ * of 0, starting from the attribute state captured for the input token. When
+ * no lemma is found, the token is passed through with its term untouched and
+ * the tag attribute cleared.
+ *
+ * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
+ */
+public class MorfologikFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+  private final CharsRef scratch = new CharsRef(0);
+  private final CharacterUtils charUtils;
+
+  private State current;
+  private final TokenStream input;
+  private final IStemmer stemmer;
+  
+  private List<WordData> lemmaList;
+  private int lemmaListIndex;
+
+  /**
+   * Builds a filter for given PolishStemmer.DICTIONARY enum.
+   * A new {@link PolishStemmer} is created for this filter instance and the
+   * list of pending lemmas starts out empty.
+   *
+   * @param in   input token stream
+   * @param dict PolishStemmer.DICTIONARY enum
+   * @param version Lucene version compatibility for lowercasing.
+   */
+  public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
+    super(in);
+    this.input = in;
+    this.stemmer = new PolishStemmer(dict);
+    this.charUtils = CharacterUtils.getInstance(version);
+    this.lemmaList = Collections.emptyList();
+  }
+
+  private void popNextLemma() {
+    final WordData lemma = lemmaList.get(lemmaListIndex++);
+    termAtt.setEmpty().append(lemma.getStem());
+    tagAtt.setTag(lemma.getTag());
+  }
+
+  /**
+   * Lookup a given surface form of a token and update
+   * {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
+   * The previous lemma list is replaced and the index is rewound to 0,
+   * whether or not anything was found.
+   *
+   * @param token surface form to look up in the stemmer
+   * @return {@code true} if the stemmer returned at least one lemma
+   */
+  private boolean lookupSurfaceForm(CharSequence token) {
+      lemmaList = this.stemmer.lookup(token);
+      lemmaListIndex = 0;
+      return lemmaList.size() > 0;
+  }
+
+  /**
+   * Retrieves the next token (possibly from the list of lemmas).
+   *
+   * <p>While lemmas of the previous input token are still pending, the
+   * captured state of that token is restored, the position increment is
+   * forced to 0 and the next lemma is written into the term and tag
+   * attributes. Otherwise the next token is pulled from the input and looked
+   * up as-is, then lower-cased; the state is captured before the first lemma
+   * overwrites the term, so later lemmas start from the original token.
+   *
+   * @return {@code true} if a token was produced, {@code false} once the
+   *         input stream is exhausted
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (lemmaListIndex < lemmaList.size()) {
+      restoreState(current);
+      posIncrAtt.setPositionIncrement(0);
+      popNextLemma();
+      return true;
+    } else if (this.input.incrementToken()) {
+      if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+        current = captureState();
+        popNextLemma();
+      } else {
+        tagAtt.clear();
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Produce a lower-cased copy of the given character sequence.
+   *
+   * <p>The input is not modified: the result is written, code point by code
+   * point, into the reusable {@link #scratch} buffer, which is then returned.
+   * The returned sequence is therefore overwritten by the next call.
+   *
+   * <p>NOTE(review): the same index is used to read the input and to write the
+   * output, which assumes a lower-cased code point occupies as many chars as
+   * the original one - confirm this holds for all supported input.
+   *
+   * @param chs sequence to convert
+   * @return the shared scratch buffer holding the lower-cased characters
+   */
+  private CharSequence toLowercase(CharSequence chs) {
+    final int length = scratch.length = chs.length();
+    scratch.grow(length);
+
+    char buffer[] = scratch.chars;
+    for (int i = 0; i < length;) {
+      i += Character.toChars(
+          Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);      
+    }
+
+    return scratch;
+  }
+
+  /**
+   * Resets stems accumulator and hands over to superclass.
+   * Any lemmas still pending from the last input token are discarded.
+   */
+  @Override
+  public void reset() throws IOException {
+    lemmaListIndex = 0;
+    lemmaList = Collections.emptyList();
+    super.reset();
+  }
+}
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
@ -0,0 +1,40 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute holding the morphosyntactic (part-of-speech) tag of the current term.
+ *
+ * <p>Morfologik dictionaries provide morphosyntactic annotations for
+ * surface forms. For the exact format and description of these,
+ * see the project's documentation (annotations vary by dictionary!).
+ */
+public interface MorphosyntacticTagAttribute extends Attribute {
+  /**
+   * Set the POS tag. The default value (no-value) is null.
+   *
+   * @param pos POS tag corresponding to current lemma
+   */
+  public void setTag(CharSequence pos);
+
+  /**
+   * Returns the POS tag of the term.
+   *
+   * @return the current tag, or null (the default) when no tag is set
+   */
+  public CharSequence getTag();
+
+  /** Clear to default value, which is null (no tag). */
+  public void clear();
+}
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
@ -0,0 +1,91 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * Default implementation of {@link MorphosyntacticTagAttribute}, holding a
+ * single, possibly <code>null</code>, tag. Equality and hash code are both
+ * based on the character content of the tag, not on the identity or the
+ * concrete type of the underlying {@link CharSequence}.
+ *
+ * @see MorphosyntacticTagAttribute
+ */
+public class MorphosyntacticTagAttributeImpl extends AttributeImpl 
+  implements MorphosyntacticTagAttribute, Cloneable {
+
+  /**
+   * Either the original tag from WordData or a clone.
+   */
+  private CharSequence tag;
+
+  /** 
+   * Set the tag. A <code>null</code> or zero-length sequence is stored as
+   * <code>null</code> (no value). The sequence is stored by reference; it is
+   * not copied.
+   *
+   * @param pos the tag to store, may be <code>null</code>
+   */
+  public void setTag(CharSequence pos) {
+    this.tag = ((pos == null || pos.length() == 0) ? null : pos);
+  }
+
+  /**
+   * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
+   * because it may change with each new term!
+   *
+   * @return the stored tag or <code>null</code> if there is none
+   */
+  public CharSequence getTag() {
+    return tag;
+  }
+
+  /** Resets the tag to <code>null</code> (no value). */
+  public void clear() {
+    tag = null;
+  }
+
+  /**
+   * Two attributes are equal if their tags are both <code>null</code> or
+   * consist of the same characters.
+   */
+  public boolean equals(Object other) {
+    if (other == this)
+      return true;
+    if (other instanceof MorphosyntacticTagAttribute) {
+      return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
+    }
+    return false;
+  }
+
+  /**
+   * Check if two char sequences are the same (both <code>null</code>, or of
+   * equal length with identical characters at every index).
+   */
+  private boolean equal(CharSequence chs1, CharSequence chs2) {
+    if (chs1 == null && chs2 == null)
+      return true;
+    if (chs1 == null || chs2 == null)
+      return false;
+    int l1 = chs1.length();
+    int l2 = chs2.length();
+    if (l1 != l2)
+      return false;
+    for (int i = 0; i < l1; i++)
+      if (chs1.charAt(i) != chs2.charAt(i))
+        return false;
+    return true;
+  }
+
+  /**
+   * Content-based hash code, consistent with {@link #equals(Object)}.
+   * CharSequence does not define hashCode(), so delegating to the tag's own
+   * hashCode() would give different values for, e.g., a String and a
+   * StringBuilder holding the same characters even though equals() treats
+   * them as equal. The formula is the one used by String.hashCode(), so the
+   * value is unchanged for String tags.
+   */
+  public int hashCode() {
+    if (tag == null)
+      return 0;
+    int hash = 0;
+    for (int i = 0, len = tag.length(); i < len; i++)
+      hash = 31 * hash + tag.charAt(i);
+    return hash;
+  }
+
+  /**
+   * Copies this attribute's tag to <code>target</code> by calling its
+   * setTag method. The tag reference is passed as-is; the character
+   * sequence itself is not duplicated.
+   *
+   * @param target must implement {@link MorphosyntacticTagAttribute}
+   */
+  public void copyTo(AttributeImpl target) {
+    ((MorphosyntacticTagAttribute) target).setTag(this.tag);
+  }
+
+  /**
+   * Returns a new instance whose tag is a String snapshot (via toString())
+   * of the current tag, or <code>null</code> if there is no tag, so the
+   * clone is not affected when the original sequence changes later.
+   */
+  public Object clone() {
+    MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
+    cloned.tag = (tag == null ? null : tag.toString());
+    return cloned;
+  }
+}
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
@ -0,0 +1,34 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+  <head>
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
+  </head>
+  <body>
+    <p>
+      This package provides a dictionary-driven lemmatization ("accurate stemming")
+      filter and analyzer for the Polish language, based on the
+      <a href="http://morfologik.blogspot.com/">Morfologik library</a> developed 
+      by Dawid Weiss and Marcin Miłkowski.
+    </p>
+    <p>
+    The MorfologikFilter yields one or more terms for each token. Each
+    of those terms is given the same position in the index.
+    </p>
+  </body>
+</html>
--- a/modules/analysis/morfologik/src/java/overview.html
+++ b/modules/analysis/morfologik/src/java/overview.html
@ -0,0 +1,34 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+  <head>
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
+  </head>
+  <body>
+    <p>
+      This package provides a dictionary-driven lemmatization ("accurate stemming")
+      filter and analyzer for the Polish language, based on the
+      <a href="http://morfologik.blogspot.com/">Morfologik library</a> developed 
+      by Dawid Weiss and Marcin Miłkowski.
+    </p>
+    <p>
+    The MorfologikFilter yields one or more terms for each token. Each
+    of those terms is given the same position in the index.
+    </p>
+  </body>
+</html>
--- a/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
+++ b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@ -0,0 +1,105 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Tests for MorfologikAnalyzer: lemmas produced for single and multiple
+ * tokens, offsets and position increments, analyzer reuse, letter-case
+ * handling and morphosyntactic tags.
+ *
+ * TODO: The tests below rely on the order of returned lemmas, which is probably not good. 
+ */
+public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
+
+  /** Creates the analyzer under test for the current test version. */
+  private Analyzer getTestAnalyzer() {
+    return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Test stemming of single tokens with Morfologik library. */
+  public final void testSingleTokens() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    assertAnalyzesToReuse(a, "a", new String[] { "a" });
+    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+    assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
+  }
+
+  /** Test stemming of multiple tokens and proper term metrics. */
+  public final void testMultipleTokens() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    assertAnalyzesToReuse(
+      a,
+      "liście danych",
+      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
+      new int[] { 0, 0, 0, 7, 7, 7 },
+      new int[] { 6, 6, 6, 13, 13, 13 },
+      new int[] { 1, 0, 0, 1, 0, 0 });
+  }
+
+  /**
+   * Test reuse of MorfologikFilter with leftover stems: the first stream is
+   * abandoned after its first lemma, then the analyzer's reusable stream is
+   * requested again for a different input.
+   */
+  public final void testLeftoverStems() throws IOException {
+    Analyzer a = getTestAnalyzer();
+    TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
+    CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
+    ts_1.reset();
+    // Fail here, rather than on a stale term, if the stream yields nothing.
+    assertTrue("first stream", ts_1.incrementToken());
+    assertEquals("first stream", "liść", termAtt_1.toString());
+
+    TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
+    CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
+    ts_2.reset();
+    assertTrue("second stream", ts_2.incrementToken());
+    assertEquals("second stream", "dany", termAtt_2.toString());
+  }
+
+  /** Test stemming of mixed-case tokens. */
+  public final void testCase() throws IOException {
+    Analyzer a = getTestAnalyzer();
+
+    assertAnalyzesToReuse(a, "AGD",      new String[] { "artykuły gospodarstwa domowego" });
+    assertAnalyzesToReuse(a, "agd",      new String[] { "artykuły gospodarstwa domowego" });
+
+    assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
+    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+
+    assertAnalyzesToReuse(a, "Aarona",   new String[] { "Aaron" });
+    assertAnalyzesToReuse(a, "aarona",   new String[] { "aarona" });
+
+    assertAnalyzesToReuse(a, "Liście",   new String[] { "liść", "list", "lista" });
+  }
+
+  /**
+   * Advances the stream by one token and checks its term text and
+   * morphosyntactic tag.
+   *
+   * @param ts stream positioned before the token to check
+   * @param term expected term text
+   * @param pos expected tag
+   */
+  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+    // Without this check an exhausted stream would be compared against
+    // whatever state the attributes were left in.
+    assertTrue(ts.incrementToken());
+    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
+    assertEquals(pos,  ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+  }
+
+  /** Test morphosyntactic annotations. */
+  public final void testPOSAttribute() throws IOException {
+    TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
+    // Consumers reset the stream before the first incrementToken(), as
+    // testLeftoverStems does.
+    ts.reset();
+
+    assertPOSToken(ts, "liść",  "subst:pl:acc.nom.voc:m3");
+    assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
+    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+  }
+}