LUCENE-4138: Update morfologik (polish stemming) to release 1.5.3. Changed the way morphosyntactic tags are exposed (a list of tags for a single lemma instead of a compound tag).

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354840 13f79535-47bb-0310-9956-ffa450edef68
2012-06-28 07:35:36 +00:00 · 2012-06-28 07:35:36 +00:00 · 8127865e2d
parent 25dd0dd17b
commit 8127865e2d
22 changed files with 253 additions and 159 deletions
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@ -97,9 +97,9 @@
 	<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
 	<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
 	<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@ -303,7 +303,7 @@
      <dependency>
        <groupId>org.carrot2</groupId>
        <artifactId>morfologik-polish</artifactId>
-        <version>1.5.2</version>
+        <version>1.5.3</version>
      </dependency>
      <dependency>
        <groupId>org.codehaus.woodstox</groupId>
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -7,6 +7,16 @@ http://s.apache.org/luceneversions
 ======================= Lucene 5.0.0 =======================


+======================= Lucene 4.0.0-BETA =======================
+
+API Changes
+
+* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
+  The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
+  has a different API (carries a list of tags instead of a compound tag). Upgrade
+  of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
+
+
 ======================= Lucene 4.0.0-ALPHA =======================

 More information about this release, including any errata related to the 
--- a/lucene/analysis/morfologik/build.xml
+++ b/lucene/analysis/morfologik/build.xml
@ -27,9 +27,9 @@

  <path id="classpath">
    <pathelement path="${analyzers-common.jar}"/>
-    <pathelement path="lib/morfologik-fsa-1.5.2.jar"/>
-    <pathelement path="lib/morfologik-polish-1.5.2.jar"/>
-    <pathelement path="lib/morfologik-stemming-1.5.2.jar"/>
+    <pathelement path="lib/morfologik-fsa-1.5.3.jar"/>
+    <pathelement path="lib/morfologik-polish-1.5.3.jar"/>
+    <pathelement path="lib/morfologik-stemming-1.5.3.jar"/>
    <path refid="base.classpath"/>
  </path>

--- a/lucene/analysis/morfologik/ivy.xml
+++ b/lucene/analysis/morfologik/ivy.xml
@ -19,9 +19,9 @@
 <ivy-module version="2.0">
    <info organisation="org.apache.lucene" module="analyzers-morfologik"/>
    <dependencies>
-      <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.2" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.2" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.2" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
      <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
    </dependencies>
 </ivy-module>
--- a/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1
@ -1 +0,0 @@
-1513ee81494d7856f607ff8fffc74b4c6cbe0d48
--- a/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
@ -0,0 +1 @@
+d1f729cd3019e6d86485226202f84458141a5688
--- a/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
@ -1,6 +1,6 @@

 Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification, 
--- a/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1
@ -1 +0,0 @@
-ee23a00580efe973aafa6f2c225e52951832901b
--- a/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
@ -0,0 +1 @@
+8217b6f7ad018ceda0e824b2e60340000da4397a
--- a/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
@ -1,13 +1,33 @@
-morfologik-polish, TERMS OF LICENCE
+BSD-licensed dictionary of Polish (Morfologik)

-This JAR contains and makes use of data from Polish ispell/myspell 
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+Copyright (c) 2012, Marcin Miłkowski
+All rights reserved.

-Part-of-speech tags were added in Morfologik project and are not found 
-in the data from sjp.pl. 
+Redistribution and  use in  source and binary  forms, with  or without
+modification, are permitted provided that the following conditions are
+met:

-----
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
+LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
+SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
+WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--

 BSD-licensed dictionary of Polish (SGJP)
 http://sgjp.pl/morfeusz/
--- a/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
@ -1,9 +1,6 @@
-This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
-(http://morfologik.blogspot.com/).

-This JAR contains and makes use of data from Polish ispell/myspell 
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+This product includes data from BSD-licensed dictionary of Polish (Morfologik)
+(http://morfologik.blogspot.com/)

 This product includes data from BSD-licensed dictionary of Polish (SGJP)
 (http://sgjp.pl/morfeusz/)
--- a/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1
@ -1 +0,0 @@
-eba98b7cd049e07d55a64b180345954b62e42ec5
--- a/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
+++ b/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
@ -0,0 +1 @@
+c4ead57b78fa71b00553ff21da6fb5a326e914e8
--- a/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
@ -1,6 +1,6 @@

 Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification, 
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@ -19,8 +19,7 @@ package org.apache.lucene.analysis.morfologik;
 */

 import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

 import morfologik.stemming.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
@ -30,13 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.*;

 /**
 * {@link TokenFilter} using Morfologik library.
 *
- * MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
+ * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
 * annotations for produced lemmas. See the Morfologik documentation for details.
 * 
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
@ -44,7 +42,7 @@ import org.apache.lucene.util.Version;
 public class MorfologikFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
+  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  private final CharsRef scratch = new CharsRef(0);
@ -55,6 +53,8 @@ public class MorfologikFilter extends TokenFilter {
  private final IStemmer stemmer;
  
  private List<WordData> lemmaList;
+  private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>();
+
  private int lemmaListIndex;

  /**
@ -73,9 +73,43 @@ public class MorfologikFilter extends TokenFilter {
  }

  private void popNextLemma() {
-    final WordData lemma = lemmaList.get(lemmaListIndex++);
-    termAtt.setEmpty().append(lemma.getStem());
-    tagAtt.setTag(lemma.getTag());
+    // Collect all tags for the next unique lemma.
+    CharSequence currentStem;
+    int tags = 0;
+    do {
+      final WordData lemma = lemmaList.get(lemmaListIndex++);
+      currentStem = lemma.getStem();
+      final CharSequence tag = lemma.getTag();
+      if (tag != null) {
+        if (tagsList.size() <= tags) {
+          tagsList.add(new StringBuilder());
+        }
+
+        final StringBuilder buffer = tagsList.get(tags++);  
+        buffer.setLength(0);
+        buffer.append(lemma.getTag());
+      }
+    } while (lemmaListIndex < lemmaList.size() &&
+             equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
+
+    // Set the lemma's base form and tags as attributes.
+    termAtt.setEmpty().append(currentStem);
+    tagsAtt.setTags(tagsList.subList(0, tags));
+  }
+
+  /**
+   * Compare two char sequences for equality. Assumes non-null arguments. 
+   */
+  private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
+    int len1 = s1.length();
+    int len2 = s2.length();
+    if (len1 != len2) return false;
+    for (int i = len1; --i >= 0;) {
+      if (s1.charAt(i) != s2.charAt(i)) { 
+        return false; 
+      }
+    }
+    return true;
  }

  /**
@ -101,7 +135,7 @@ public class MorfologikFilter extends TokenFilter {
        current = captureState();
        popNextLemma();
      } else {
-        tagAtt.clear();
+        tagsAtt.clear();
      }
      return true;
    } else {
@ -130,6 +164,7 @@ public class MorfologikFilter extends TokenFilter {
  public void reset() throws IOException {
    lemmaListIndex = 0;
    lemmaList = Collections.emptyList();
+    tagsList.clear();
    super.reset();
  }
 }
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
@ -1,92 +0,0 @@
-// -*- c-basic-offset: 2 -*-
-package org.apache.lucene.analysis.morfologik;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.AttributeImpl;
-
-/**
- * Morphosyntactic annotations for surface forms.
- * @see MorphosyntacticTagAttribute
- */
-public class MorphosyntacticTagAttributeImpl extends AttributeImpl 
-  implements MorphosyntacticTagAttribute, Cloneable {
-
-  /**
-   * Either the original tag from WordData or a clone.
-   */
-  private CharSequence tag;
-
-  /** 
-   * Set the tag.
-   */
-  public void setTag(CharSequence pos) {
-    this.tag = ((pos == null || pos.length() == 0) ? null : pos);
-  }
-
-  /**
-   * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
-   * because it may change with each new term!
-   */
-  public CharSequence getTag() {
-    return tag;
-  }
-
-  public void clear() {
-    tag = null;
-  }
-
-  public boolean equals(Object other) {
-    if (other instanceof MorphosyntacticTagAttribute) {
-      return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
-    }
-    return false;
-  }
-
-  /**
-   * Check if two char sequences are the same.
-   */
-  private boolean equal(CharSequence chs1, CharSequence chs2) {
-    if (chs1 == null && chs2 == null)
-      return true;
-    if (chs1 == null || chs2 == null)
-      return false;
-    int l1 = chs1.length();
-    int l2 = chs2.length();
-    if (l1 != l2)
-      return false;
-    for (int i = 0; i < l1; i++)
-      if (chs1.charAt(i) != chs2.charAt(i))
-        return false;
-    return true;
-  }
-
-  public int hashCode() {
-    return this.tag == null ? 0 : tag.hashCode();
-  }
-
-  public void copyTo(AttributeImpl target) {
-    ((MorphosyntacticTagAttribute) target).setTag(this.tag);
-  }
-
-  public MorphosyntacticTagAttributeImpl clone() {
-    MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
-    cloned.tag = (tag == null ? null : tag.toString());
-    return cloned;
-  }
-}
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.morfologik;
 * limitations under the License.
 */

+import java.util.List;
+
 import org.apache.lucene.util.Attribute;

 /** 
@ -25,15 +27,18 @@ import org.apache.lucene.util.Attribute;
 * surface forms. For the exact format and description of these,
 * see the project's documentation (annotations vary by dictionary!).
 */
-public interface MorphosyntacticTagAttribute extends Attribute {
+public interface MorphosyntacticTagsAttribute extends Attribute {
  /** 
   * Set the POS tag. The default value (no-value) is null.
-   * @param pos POS tag corresponding to current lemma
+   * 
+   * @param tags A list of POS tags corresponding to current lemma.
   */
-  public void setTag(CharSequence pos);
+  public void setTags(List<StringBuilder> tags);

-  /** Returns the POS tag of the term. */
-  public CharSequence getTag();
+  /** 
+   * Returns the POS tag of the term.
+   */
+  public List<StringBuilder> getTags();

  /** Clear to default value. */
  public void clear();
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
@ -0,0 +1,96 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * Morphosyntactic annotations for surface forms.
+ * @see MorphosyntacticTagsAttribute
+ */
+public class MorphosyntacticTagsAttributeImpl extends AttributeImpl 
+  implements MorphosyntacticTagsAttribute, Cloneable {
+  
+  /**
+   * A list of potential tag variants for the current token.
+   */
+  private List<StringBuilder> tags;
+
+  /**
+   * Returns the POS tag of the term. If you need a copy of this char sequence, copy
+   * its contents (and clone {@link StringBuilder}s) because it changes with 
+   * each new term to avoid unnecessary memory allocations.
+   */
+  @Override
+  public List<StringBuilder> getTags() {
+    return tags;
+  }
+
+  public void clear() {
+    tags = null;
+  }
+
+  public boolean equals(Object other) {
+    if (other instanceof MorphosyntacticTagsAttribute) {
+      return equal(this.getTags(), ((MorphosyntacticTagsAttribute) other).getTags());
+    }
+    return false;
+  }
+
+  private boolean equal(Object l1, Object l2) {
+    return l1 == null ? (l2 == null) : (l1.equals(l2));
+  }
+
+  public int hashCode() {
+    return this.tags == null ? 0 : tags.hashCode();
+  }
+
+  /**
+   * Sets the internal tags reference to the given list. The contents
+   * is not copied. 
+   */
+  @Override
+  public void setTags(List<StringBuilder> tags) {
+    this.tags = tags;
+  }
+
+  public void copyTo(AttributeImpl target) {
+    List<StringBuilder> cloned = null;
+    if (tags != null) {
+      cloned = new ArrayList<StringBuilder>(tags.size());
+      for (StringBuilder b : tags) {
+        cloned.add(new StringBuilder(b));
+      }
+    }
+    ((MorphosyntacticTagsAttribute) target).setTags(cloned);
+  }
+
+  public MorphosyntacticTagsAttributeImpl clone() {
+    MorphosyntacticTagsAttributeImpl cloned = new MorphosyntacticTagsAttributeImpl();
+    this.copyTo(cloned);
+    return cloned;
+  }
+  
+  @Override
+  public String toString() {
+    return tags == null ? "<no tags>" : tags.toString();
+  }
+}
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@ -1,4 +1,3 @@
-// -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;

 /*
@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfologik;

 import java.io.IOException;
 import java.io.StringReader;
+import java.util.TreeSet;

-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
  public final void testSingleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesToReuse(a, "a", new String[] { "a" });
-    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
-    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+    assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
    assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
  }

@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesToReuse(
      a,
      "liście danych",
-      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
-      new int[] { 0, 0, 0, 7, 7, 7 },
-      new int[] { 6, 6, 6, 13, 13, 13 },
-      new int[] { 1, 0, 0, 1, 0, 0 });
+      new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
+      new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
+      new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
+      new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
  }

  /** Test reuse of MorfologikFilter with leftover stems. */
@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
    CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
    ts_1.reset();
    ts_1.incrementToken();
-    assertEquals("first stream", "liść", termAtt_1.toString());
+    assertEquals("first stream", "liście", termAtt_1.toString());

    TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
    CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
@ -76,31 +74,59 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
  public final void testCase() throws IOException {
    Analyzer a = getTestAnalyzer();

-    assertAnalyzesToReuse(a, "AGD",      new String[] { "artykuły gospodarstwa domowego" });
+    assertAnalyzesToReuse(a, "AGD",      new String[] { "AGD", "artykuły gospodarstwa domowego" });
    assertAnalyzesToReuse(a, "agd",      new String[] { "artykuły gospodarstwa domowego" });

    assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
-    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+    assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });

    assertAnalyzesToReuse(a, "Aarona",   new String[] { "Aaron" });
    assertAnalyzesToReuse(a, "aarona",   new String[] { "aarona" });

-    assertAnalyzesToReuse(a, "Liście",   new String[] { "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "Liście",   new String[] { "liście", "liść", "list", "lista" });
  }

-  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+  private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
    ts.incrementToken();
    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
-    assertEquals(pos,  ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+    
+    TreeSet<String> actual = new TreeSet<String>();
+    TreeSet<String> expected = new TreeSet<String>();
+    for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
+      actual.add(b.toString());
+    }
+    for (String s : tags) {
+      expected.add(s);
+    }
+    
+    if (!expected.equals(actual)) {
+      System.out.println("Expected:\n" + expected);
+      System.out.println("Actual:\n" + actual);
+      assertEquals(expected, actual);
+    }
  }

  /** Test morphosyntactic annotations. */
  public final void testPOSAttribute() throws IOException {
    TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));

-    assertPOSToken(ts, "liść",  "subst:pl:acc.nom.voc:m3");
-    assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
-    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+    assertPOSToken(ts, "liście",  
+        "subst:sg:acc:n2",
+        "subst:sg:nom:n2",
+        "subst:sg:voc:n2");
+
+    assertPOSToken(ts, "liść",  
+        "subst:pl:acc:m3",
+        "subst:pl:nom:m3",
+        "subst:pl:voc:m3");
+
+    assertPOSToken(ts, "list",  
+        "subst:sg:loc:m3",
+        "subst:sg:voc:m3");
+
+    assertPOSToken(ts, "lista", 
+        "subst:sg:dat:f",
+        "subst:sg:loc:f");
  }

  /** blast some random strings through the analyzer */
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@ -312,9 +312,7 @@
  <property name="analyzers-morfologik.jar" value="${common.dir}/build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
  <fileset id="analyzers-morfologik.fileset" dir="${common.dir}">
    <include name="build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar" />
-    <include name="analysis/morfologik/lib/morfologik-fsa-1.5.2.jar" />
-    <include name="analysis/morfologik/lib/morfologik-polish-1.5.2.jar" />
-    <include name="analysis/morfologik/lib/morfologik-stemming-1.5.2.jar" />
+    <include name="analysis/morfologik/lib/morfologik-*.jar" />
  </fileset>
  <target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
    <module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>
--- a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
@ -7,7 +7,6 @@ import java.util.Map;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.solr.schema.IndexSchema;

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more