LUCENE-2341: integrating morfologik (Polish stemming/ morphosyntactic dictionary) as an analysis module.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141022 13f79535-47bb-0310-9956-ffa450edef68
2011-06-29 09:24:14 +00:00 · 2011-06-29 09:24:14 +00:00 · d188d3df90
parent 216e696809
commit d188d3df90
23 changed files with 816 additions and 4 deletions
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@ -38,14 +38,15 @@
 	<classpathentry kind="src" path="modules/analysis/stempel/src/java"/>
 	<classpathentry kind="src" path="modules/analysis/stempel/src/resources"/>
 	<classpathentry kind="src" path="modules/analysis/stempel/src/test"/>
 	<classpathentry kind="src" path="modules/analysis/morfologik/src/java"/>
 	<classpathentry kind="src" path="modules/analysis/morfologik/src/test"/>
 	<classpathentry kind="src" path="modules/benchmark/src/java"/>
 	<classpathentry kind="src" path="modules/benchmark/src/test"/>
 	<classpathentry kind="src" path="modules/common/src/java"/>
 	<classpathentry kind="src" path="modules/common/src/test"/>
 	<classpathentry kind="src" path="modules/grouping/src/java"/>
 	<classpathentry kind="src" path="modules/grouping/src/test"/>
-  <classpathentry kind="src" path="modules/queries/src/java"/>
+	<classpathentry kind="src" path="modules/queries/src/java"/>
 	<classpathentry kind="src" path="modules/queries/src/test"/>
 	<classpathentry kind="src" path="modules/suggest/src/java"/>
 	<classpathentry kind="src" path="modules/suggest/src/test"/>
 	<classpathentry kind="src" path="solr/src/java"/>
@ -80,6 +81,9 @@
 	<classpathentry kind="lib" path="lucene/contrib/queries/lib/jakarta-regexp-1.4.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-beanutils-1.7.0.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-collections-3.1.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-compress-1.1.jar"/>
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@ -33,7 +33,11 @@ API Changes
   in half. (Robert Muir)
 New Features
-   
+
 * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer 
   (accurate stemmer) for Polish (includes morphosyntactic annotations).
   (Michał Dybizbański, Dawid Weiss)
 * LUCENE-2413: Consolidated Solr analysis components into common. 
   New features from Solr now available to Lucene users include:
   - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
--- a/modules/analysis/LICENSE.txt
+++ b/modules/analysis/LICENSE.txt
@ -263,3 +263,80 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.
 The following license applies to the Morfologik project:
 Copyright (c) 2006 Dawid Weiss
 Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
 All rights reserved.
 Redistribution and use in source and binary forms, with or without modification, 
 are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice, 
    this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, 
    this list of conditions and the following disclaimer in the documentation 
    and/or other materials provided with the distribution.
    * Neither the name of Morfologik nor the names of its contributors 
    may be used to endorse or promote products derived from this software 
    without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ---
 The dictionary comes from Morfologik project. Morfologik uses data from 
 Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and 
 is licenced on the terms of (inter alia) LGPL and Creative Commons 
 ShareAlike. The part-of-speech tags were added in Morfologik project and
 are not found in the data from sjp.pl. The tagset is similar to IPI PAN
 tagset.
 ---
 The following license applies to the Morfeusz project,
 used by org.apache.lucene.analysis.morfologik.
 BSD-licensed dictionary of Polish (SGJP)
 http://sgjp.pl/morfeusz/
 Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 
 	    	 Marcin Woliński, Robert Wołosz
 All rights reserved.
 Redistribution and  use in  source and binary  forms, with  or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the
   distribution.
 THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
 OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
 LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
 SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
 BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
 WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/NOTICE.txt
+++ b/modules/analysis/NOTICE.txt
@ -62,3 +62,12 @@ WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
 is derived from Unicode data such as the Unicode Character Database. 
 See http://unicode.org/copyright.html for more details.
 The Morfologik analyzer (morfologik) includes BSD-licensed software
 developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
 Morfologik uses data from Polish ispell/myspell dictionary
 (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
 LGPL and Creative Commons ShareAlike.
 Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
 (http://sgjp.pl/morfeusz/)
--- a/modules/analysis/README.txt
+++ b/modules/analysis/README.txt
@ -35,11 +35,15 @@ lucene-analyzers-stempel-XX.jar
  An add-on analysis library that contains a universal algorithmic stemmer,
  including tables for the Polish language.
 lucene-analyzers-morfologik-XX.jar
  An analyzer using the Morfologik stemming library.
 common/src/java
 icu/src/java
 phonetic/src/java
 smartcn/src/java
 stempel/src/java
 morfologik/src/java
  The source code for the six libraries.
 common/src/test
@ -47,4 +51,5 @@ icu/src/test
 phonetic/src/test
 smartcn/src/test
 stempel/src/test
 morfologik/src/test
  Unit tests for the six libraries.
--- a/modules/analysis/build.xml
+++ b/modules/analysis/build.xml
@ -25,6 +25,7 @@
      - icu: Analyzers that use functionality from ICU
      - smartcn:	Smart Analyzer for Simplified Chinese Text
      - stempel:	Algorithmic Stemmer for Polish
      - morfologik:	Morfologik Stemmer
  </description>
  <target name="common">
@ -47,8 +48,12 @@
    <ant dir="stempel" />
  </target>
  <target name="morfologik">
    <ant dir="morfologik" />
  </target>
  <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
+  <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
  <target name="clean">
    <ant dir="common" target="clean" />
@ -56,6 +61,7 @@
    <ant dir="phonetic" target="clean" />
    <ant dir="smartcn" target="clean" />
    <ant dir="stempel" target="clean" />
    <ant dir="morfologik" target="clean" />
  </target>
  <target name="validate">
    <ant dir="common" target="validate" />
@ -63,6 +69,7 @@
    <ant dir="phonetic" target="validate" />
    <ant dir="smartcn" target="validate" />
    <ant dir="stempel" target="validate" />
    <ant dir="morfologik" target="validate" />
  </target>
  <target name="compile-core">
    <ant dir="common" target="compile-core" />
@ -70,6 +77,7 @@
    <ant dir="phonetic" target="compile-core" />
    <ant dir="smartcn" target="compile-core" />
    <ant dir="stempel" target="compile-core" />
    <ant dir="morfologik" target="compile-core" />
  </target>
  <target name="compile-test">
    <ant dir="common" target="compile-test" />
@ -77,6 +85,7 @@
    <ant dir="phonetic" target="compile-test" />
    <ant dir="smartcn" target="compile-test" />
    <ant dir="stempel" target="compile-test" />
    <ant dir="morfologik" target="compile-test" />
  </target>
  <target name="test">
    <ant dir="common" target="test" />
@ -84,6 +93,7 @@
    <ant dir="phonetic" target="test" />
    <ant dir="smartcn" target="test" />
    <ant dir="stempel" target="test" />
    <ant dir="morfologik" target="test" />
  </target>
  <target name="build-artifacts-and-tests" depends="default,compile-test" />
@ -94,6 +104,7 @@
    <ant dir="phonetic" target="dist-maven" />
    <ant dir="smartcn" target="dist-maven" />
    <ant dir="stempel" target="dist-maven" />
    <ant dir="morfologik" target="dist-maven" />
  </target>  	
  <target name="javadocs">
@ -102,6 +113,7 @@
    <ant dir="phonetic" target="javadocs" />
    <ant dir="smartcn" target="javadocs" />
    <ant dir="stempel" target="javadocs" />
    <ant dir="morfologik" target="javadocs" />
  </target>  	
  <target name="javadocs-index.html">
@ -110,6 +122,7 @@
    <ant dir="phonetic" target="javadocs-index.html" />
    <ant dir="smartcn" target="javadocs-index.html" />
    <ant dir="stempel" target="javadocs-index.html" />
    <ant dir="morfologik" target="javadocs-index.html" />
  </target>
 </project>
--- a/modules/analysis/morfologik/build.xml
+++ b/modules/analysis/morfologik/build.xml
@ -0,0 +1,61 @@
 <?xml version="1.0"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at
        http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
 <!--
    Ant build for the Morfologik analysis module. It declares the module's
    output locations and bundled jars, pulls in the shared contrib build,
    and makes sure the analyzers-common jar exists before compiling.
 -->
 <project name="analyzers-morfologik" default="default">
  <description>
    Morfologik Analyzer
  </description>
  <!-- Output locations, resolved relative to this build file. -->
  <property name="build.dir" location="../build/morfologik" />
  <property name="dist.dir" location="../dist/morfologik" />
  <!-- The three Morfologik jars shipped in this module's lib/ directory. -->
  <path id="additional.dependencies">
    <fileset dir="lib" includes="morfologik-fsa-*.jar"/>
    <fileset dir="lib" includes="morfologik-polish-*.jar"/>
    <fileset dir="lib" includes="morfologik-stemming-*.jar"/>
  </path>
  <!--
    Flattens the jar list above into the project.classpath property using
    unix path conventions. NOTE(review): presumably read by the imported
    contrib build below, which is why it is set before the import; confirm.
  -->
  <pathconvert property="project.classpath" targetos="unix" refid="additional.dependencies" />
  <import file="../../../lucene/contrib/contrib-build.xml"/>
  <!--
    module-uptodate is not defined in this file (presumably a macro provided
    through the import above). It is given the expected analyzers-common jar
    and the names of two properties: analyzers-common.uptodate, which gates
    build-analyzers-common below, and analyzers-common.jar, which is used on
    the compile classpath.
  -->
  <module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
    property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
  <!-- Compile classpath: analyzers-common plus base.classpath (declared outside this file). -->
  <path id="classpath">
    <pathelement path="${analyzers-common.jar}"/>
    <path refid="base.classpath"/>
  </path>
  <!-- Test classpath: the compile classpath, Lucene's test classes, JUnit and this module's compiled classes. -->
  <path id="test.classpath">
    <path refid="classpath"/>
    <pathelement location="../../../lucene/build/classes/test-framework/"/>
    <pathelement location="../../../lucene/build/classes/test/"/>
    <path refid="junit-path"/>
    <pathelement location="${build.dir}/classes/java"/>
  </path>
  <!-- Builds analyzers-common first (if needed), then runs the shared compile-core target. -->
  <target name="compile-core" depends="build-analyzers-common, common.compile-core" />
  <!-- Skipped when analyzers-common.uptodate is set; otherwise builds ../common in a fresh property scope. -->
  <target name="build-analyzers-common" unless="analyzers-common.uptodate">
    <echo>Morfologik building dependency ${analyzers-common.jar}</echo>
    <ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
  </target>
 </project>
--- a/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar
@ -0,0 +1,2 @@
 AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.
 Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
@ -0,0 +1,29 @@
 Copyright (c) 2006 Dawid Weiss
 Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
 All rights reserved.
 Redistribution and use in source and binary forms, with or without modification, 
 are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice, 
    this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, 
    this list of conditions and the following disclaimer in the documentation 
    and/or other materials provided with the distribution.
    * Neither the name of Morfologik nor the names of its contributors 
    may be used to endorse or promote products derived from this software 
    without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-fsa-NOTICE.txt
@ -0,0 +1,2 @@
 This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
 (http://morfologik.blogspot.com/).
--- a/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar
@ -0,0 +1,2 @@
 AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.
 Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-polish-LICENSE-CCSA-BSD.txt
@ -0,0 +1,41 @@
 morfologik-polish, TERMS OF LICENCE
 This JAR contains and makes use of data from Polish ispell/myspell 
 dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
 licenced on the terms of (inter alia) LGPL or Creative Commons ShareAlike licenses.
 Part-of-speech tags were added in Morfologik project and are not found 
 in the data from sjp.pl.
 BSD-licensed dictionary of Polish (SGJP)
 http://sgjp.pl/morfeusz/
 Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 
 	    	 Marcin Woliński, Robert Wołosz
 All rights reserved.
 Redistribution and  use in  source and binary  forms, with  or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the
   distribution.
 THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
 OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
 LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
 SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
 BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
 WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
@ -0,0 +1,8 @@
 This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
 (http://morfologik.blogspot.com/).
 This product includes data from Polish ispell/myspell dictionary (http://www.sjp.pl/slownik/en/)
 licenced on the terms of (inter alia) LGPL and Creative Commons ShareAlike.
 This product includes data from BSD-licensed dictionary of Polish (SGJP)
 (http://sgjp.pl/morfeusz/)
--- a/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar
@ -0,0 +1,2 @@
 AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.
 Apache SVN contains full history.
--- a/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
@ -0,0 +1,29 @@
 Copyright (c) 2006 Dawid Weiss
 Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
 All rights reserved.
 Redistribution and use in source and binary forms, with or without modification, 
 are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice, 
    this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, 
    this list of conditions and the following disclaimer in the documentation 
    and/or other materials provided with the distribution.
    * Neither the name of Morfologik nor the names of its contributors 
    may be used to endorse or promote products derived from this software 
    without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt
+++ b/modules/analysis/morfologik/lib/morfologik-stemming-NOTICE.txt
@ -0,0 +1,2 @@
 This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
 (http://morfologik.blogspot.com/).
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
@ -0,0 +1,84 @@
 // -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.util.Version;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
 /**
 * {@link org.apache.lucene.analysis.Analyzer} using Morfologik library.
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
 public class MorfologikAnalyzer extends ReusableAnalyzerBase {
  /** Dictionary handed to every {@link MorfologikFilter} this analyzer creates. */
  private final DICTIONARY dictionary;
  /** Lucene compatibility version passed on to the tokenizer and the filters. */
  private final Version version;
  /**
   * Creates an analyzer that uses the original MORFOLOGIK dictionary.
   *
   * @param vers lucene compatibility version
   */
  public MorfologikAnalyzer(final Version vers) {
    this(vers, DICTIONARY.MORFOLOGIK);
  }
  /**
   * Creates an analyzer backed by the dictionary selected through the
   * PolishStemmer.DICTIONARY enum.
   *
   * @param vers
   *          lucene compatibility version
   * @param dict
   *          constant selecting the dictionary; see the Morfologik
   *          documentation for the available choices.
   */
  public MorfologikAnalyzer(final Version vers, final DICTIONARY dict) {
    this.dictionary = dict;
    this.version = vers;
  }
  /**
   * Assembles the analysis chain for one reader: a {@link StandardTokenizer}
   * whose output passes through a {@link StandardFilter} and then a
   * {@link MorfologikFilter}.
   *
   * @param field field name; not used when building the chain
   * @param reader source of the text to tokenize
   *
   * @return the {@link ReusableAnalyzerBase.TokenStreamComponents} holding
   *         the tokenizer and the end of the filter chain
   */
  @Override
  protected TokenStreamComponents createComponents(final String field, final Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(this.version, reader);
    final StandardFilter standardized = new StandardFilter(this.version, tokenizer);
    final MorfologikFilter lemmatized =
      new MorfologikFilter(standardized, this.dictionary, this.version);
    return new TokenStreamComponents(tokenizer, lemmatized);
  }
 }
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@ -0,0 +1,134 @@
 // -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
 import morfologik.stemming.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
 /**
 * {@link TokenFilter} that replaces each incoming token with the lemmas the
 * Morfologik library finds for it. When a token has several lemmas, the first
 * one takes the token's place and the remaining ones are emitted at the same
 * position (position increment 0). Tokens with no dictionary entry pass
 * through with their term untouched.
 *
 * <p>Every produced lemma is accompanied by a {@link MorphosyntacticTagAttribute}
 * carrying its morphosyntactic annotation. See the Morfologik documentation
 * for details.
 *
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
 public class MorfologikFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  /** Reusable buffer that receives the lower-cased copy of a token. */
  private final CharsRef scratch = new CharsRef(0);
  private final CharacterUtils charUtils;
  /** Attribute state captured for the token whose lemmas are being emitted. */
  private State current;
  // NOTE(review): the same stream is also passed to super(in); this field may
  // duplicate a reference the superclass already keeps. Confirm before removing.
  private final TokenStream input;
  private final IStemmer stemmer;
  /** Lemmas returned by the most recent dictionary lookup. */
  private List<WordData> lemmaList;
  /** Index of the next entry of {@link #lemmaList} to emit. */
  private int lemmaListIndex;
  /**
   * Creates a filter backed by the dictionary selected through the
   * PolishStemmer.DICTIONARY enum.
   *
   * @param in   input token stream
   * @param dict PolishStemmer.DICTIONARY enum
   * @param version Lucene version compatibility for lowercasing.
   */
  public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
    super(in);
    this.input = in;
    this.stemmer = new PolishStemmer(dict);
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  }
  /** Writes the next pending lemma and its tag into the attributes, then advances the index. */
  private void popNextLemma() {
    final WordData next = lemmaList.get(lemmaListIndex);
    lemmaListIndex++;
    termAtt.setEmpty().append(next.getStem());
    tagAtt.setTag(next.getTag());
  }
  /**
   * Looks up a surface form in the stemmer, replacing {@link #lemmaList}
   * with the result and rewinding {@link #lemmaListIndex}.
   *
   * @return {@code true} if at least one lemma was found
   */
  private boolean lookupSurfaceForm(CharSequence token) {
    lemmaListIndex = 0;
    lemmaList = this.stemmer.lookup(token);
    return !lemmaList.isEmpty();
  }
  /** Retrieves the next token (possibly from the list of lemmas). */
  @Override
  public final boolean incrementToken() throws IOException {
    // Lemmas still pending for the previous token: emit them on top of the
    // captured token state, stacked at the same position.
    if (lemmaListIndex < lemmaList.size()) {
      restoreState(current);
      posIncrAtt.setPositionIncrement(0);
      popNextLemma();
      return true;
    }
    if (!this.input.incrementToken()) {
      return false;
    }
    // Try the token as written first; fall back to its lower-cased form.
    final boolean found =
        lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt));
    if (found) {
      // Capture before the term is overwritten, so later lemmas start from the original token.
      current = captureState();
      popNextLemma();
    } else {
      tagAtt.clear();
    }
    return true;
  }
  /**
   * Lower-cases the given sequence code point by code point into the shared
   * {@link #scratch} buffer. The argument itself is not modified; the returned
   * sequence is overwritten by the next call.
   */
  private CharSequence toLowercase(CharSequence chs) {
    final int len = chs.length();
    scratch.length = len;
    scratch.grow(len);
    final char[] out = scratch.chars;
    int pos = 0;
    while (pos < len) {
      final int lowered = Character.toLowerCase(charUtils.codePointAt(chs, pos));
      pos += Character.toChars(lowered, out, pos);
    }
    return scratch;
  }
  /** Drops any pending lemmas, then hands over to the superclass. */
  @Override
  public void reset() throws IOException {
    lemmaList = Collections.emptyList();
    lemmaListIndex = 0;
    super.reset();
  }
 }
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
@ -0,0 +1,40 @@
 // -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.util.Attribute;
 /** 
 * Morfologik dictionaries provide morphosyntactic annotations for
 * surface forms. For the exact format and description of these,
 * see the project's documentation (annotations vary by dictionary!).
 */
 public interface MorphosyntacticTagAttribute extends Attribute {
  /**
   * Stores the POS tag that belongs to the current lemma.
   * The default value (no-value) is null.
   *
   * @param pos POS tag corresponding to current lemma
   */
  void setTag(CharSequence pos);
  /**
   * Returns the POS tag of the term.
   *
   * @return the current tag
   */
  CharSequence getTag();
  /** Resets the tag to its default value. */
  void clear();
 }
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
@ -0,0 +1,91 @@
 // -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.util.AttributeImpl;
 /**
  * Implementation of {@link MorphosyntacticTagAttribute} that keeps the tag
  * as a single {@link CharSequence} reference. Equality and hashing are both
  * defined on the tag's character content, independent of the concrete
  * {@link CharSequence} class holding it.
  *
  * @see MorphosyntacticTagAttribute
  */
 public class MorphosyntacticTagAttributeImpl extends AttributeImpl
  implements MorphosyntacticTagAttribute, Cloneable {
  /**
   * Either the original tag from WordData or a clone.
   * <code>null</code> means that no tag is set.
   */
  private CharSequence tag;
  /**
   * Set the tag. A <code>null</code> or empty sequence is stored as
   * <code>null</code>; any other sequence is stored by reference (not copied).
   *
   * @param pos POS tag to store, may be <code>null</code>
   */
  public void setTag(CharSequence pos) {
    this.tag = ((pos == null || pos.length() == 0) ? null : pos);
  }
  /**
   * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
   * because it may change with each new term!
   *
   * @return the stored tag, or <code>null</code> if no tag is set
   */
  public CharSequence getTag() {
    return tag;
  }
  /** Clears the tag back to <code>null</code>. */
  public void clear() {
    tag = null;
  }
  /**
   * Two tag attributes are equal when their tags hold the same characters
   * (or both have no tag).
   */
  @Override
  public boolean equals(Object other) {
    if (other instanceof MorphosyntacticTagAttribute) {
      return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
    }
    return false;
  }
  /**
   * Check if two char sequences are the same, character by character.
   * Two <code>null</code> references are considered the same.
   */
  private static boolean equal(CharSequence chs1, CharSequence chs2) {
    if (chs1 == null && chs2 == null) {
      return true;
    }
    if (chs1 == null || chs2 == null) {
      return false;
    }
    int l1 = chs1.length();
    int l2 = chs2.length();
    if (l1 != l2) {
      return false;
    }
    for (int i = 0; i < l1; i++) {
      if (chs1.charAt(i) != chs2.charAt(i)) {
        return false;
      }
    }
    return true;
  }
  /**
   * Content-based hash code, consistent with {@link #equals(Object)}.
   * <p>
   * The hash is computed from the tag's characters rather than by calling
   * <code>tag.hashCode()</code>: CharSequence implementations such as
   * StringBuilder use identity hashing, so two attributes with equal tags
   * could otherwise report different hash codes. The formula is the one
   * specified for {@link String#hashCode()}, so String tags hash exactly
   * as they would through <code>String.hashCode()</code>.
   */
  @Override
  public int hashCode() {
    if (tag == null) {
      return 0;
    }
    int hash = 0;
    for (int i = 0, len = tag.length(); i < len; i++) {
      hash = 31 * hash + tag.charAt(i);
    }
    return hash;
  }
  /**
   * Copies this attribute's tag into <code>target</code> through
   * {@link MorphosyntacticTagAttribute#setTag(CharSequence)}; the sequence
   * reference is passed as-is.
   *
   * @param target attribute to copy into; it is cast to
   *        {@link MorphosyntacticTagAttribute}
   */
  public void copyTo(AttributeImpl target) {
    ((MorphosyntacticTagAttribute) target).setTag(this.tag);
  }
  /**
   * Creates a new instance whose tag is a String snapshot of the current
   * tag (or <code>null</code> if no tag is set), so later changes to the
   * original sequence do not affect the clone.
   */
  @Override
  public Object clone() {
    MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
    cloned.tag = (tag == null ? null : tag.toString());
    return cloned;
  }
 }
--- a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
@ -0,0 +1,34 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
  </head>
  <body>
    <p>
      This package provides a dictionary-driven lemmatization ("accurate stemming")
      filter and analyzer for the Polish language, driven by the
      <a href="http://morfologik.blogspot.com/">Morfologik library</a> developed 
      by Dawid Weiss and Marcin Miłkowski.
    </p>
    <p>
    The MorfologikFilter yields one or more terms for each token. Each
    of those terms is given the same position in the index.
    </p>
  </body>
 </html>
--- a/modules/analysis/morfologik/src/java/overview.html
+++ b/modules/analysis/morfologik/src/java/overview.html
@ -0,0 +1,34 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
  </head>
  <body>
    <p>
      This package provides a dictionary-driven lemmatization ("accurate stemming")
      filter and analyzer for the Polish language, driven by the
      <a href="http://morfologik.blogspot.com/">Morfologik library</a> developed 
      by Dawid Weiss and Marcin Miłkowski.
    </p>
    <p>
    The MorfologikFilter yields one or more terms for each token. Each
    of those terms is given the same position in the index.
    </p>
  </body>
 </html>
--- a/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
+++ b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@ -0,0 +1,105 @@
 // -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.StringReader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
  * Tests for {@link MorfologikAnalyzer}: lemmas produced for single and
  * multiple tokens, analyzer reuse, case handling and morphosyntactic tags.
  * <p>
  * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
  */
 public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
  /** Creates the analyzer under test. */
  private Analyzer getTestAnalyzer() {
    return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
  }
  /** Test stemming of single tokens with Morfologik library. */
  public final void testSingleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(a, "a", new String[] { "a" });
    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista" });
    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
    assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
  }
  /** Test stemming of multiple tokens and proper term metrics. */
  public final void testMultipleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(
      a,
      "liście danych",
      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
      new int[] { 0, 0, 0, 7, 7, 7 },
      new int[] { 6, 6, 6, 13, 13, 13 },
      new int[] { 1, 0, 0, 1, 0, 0 });
  }
  /** Test reuse of MorfologikFilter with leftover stems. */
  public final void testLeftoverStems() throws IOException {
    Analyzer a = getTestAnalyzer();
    // Consume only the first lemma of the first stream, leaving the others pending.
    TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
    CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
    ts_1.reset();
    assertTrue("first stream has no tokens", ts_1.incrementToken());
    assertEquals("first stream", "liść", termAtt_1.toString());
    // The second stream must start with its own first lemma.
    TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
    CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
    ts_2.reset();
    assertTrue("second stream has no tokens", ts_2.incrementToken());
    assertEquals("second stream", "dany", termAtt_2.toString());
  }
  /** Test stemming of mixed-case tokens. */
  public final void testCase() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(a, "AGD",      new String[] { "artykuły gospodarstwa domowego" });
    assertAnalyzesToReuse(a, "agd",      new String[] { "artykuły gospodarstwa domowego" });
    assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
    assertAnalyzesToReuse(a, "Aarona",   new String[] { "Aaron" });
    assertAnalyzesToReuse(a, "aarona",   new String[] { "aarona" });
    assertAnalyzesToReuse(a, "Liście",   new String[] { "liść", "list", "lista" });
  }
  /**
   * Advances the stream by one token and checks its term text and tag.
   *
   * @param ts   stream to advance
   * @param term expected term text of the next token
   * @param pos  expected morphosyntactic tag of the next token
   */
  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
    // Fail here rather than compare against stale attribute values if the stream ran out.
    assertTrue("no more tokens, expected: " + term, ts.incrementToken());
    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
    CharSequence tag = ts.getAttribute(MorphosyntacticTagAttribute.class).getTag();
    assertNotNull("no tag for term: " + term, tag);
    assertEquals(pos, tag.toString());
  }
  /** Test morphosyntactic annotations. */
  public final void testPOSAttribute() throws IOException {
    TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
    // Reset before consuming, as testLeftoverStems does.
    ts.reset();
    assertPOSToken(ts, "liść",  "subst:pl:acc.nom.voc:m3");
    assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
  }
 }
		`@ -0,0 +1,2 @@`
							`AnyObjectId[34c0f34e37062f29497e87325b5124a033747cd5] was removed in git history.`
							`Apache SVN contains full history.`
		`@ -0,0 +1,2 @@`
							`This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski`
							`(http://morfologik.blogspot.com/).`
		`@ -0,0 +1,2 @@`
							`AnyObjectId[ca2fa4d318ab91d6878614b3479628bf4325bf2e] was removed in git history.`
							`Apache SVN contains full history.`
		`@ -0,0 +1,2 @@`
							`AnyObjectId[dec8226eaaa3b4a3683e7cbdbe0e526dcfffebff] was removed in git history.`
							`Apache SVN contains full history.`