diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath
index e4544f93b0e..109fb19eefa 100644
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@@ -97,9 +97,9 @@
-
-
-
+
+
+
diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template
index d3078cba4ed..40ac6c33f4f 100644
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@@ -303,7 +303,7 @@
org.carrot2
morfologik-polish
- 1.5.2
+ 1.5.3
org.codehaus.woodstox
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 01d96885b97..8bb2007df52 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -7,6 +7,16 @@ http://s.apache.org/luceneversions
======================= Lucene 5.0.0 =======================
+======================= Lucene 4.0.0-BETA =======================
+
+API Changes
+
+* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
+ The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
+ has a different API (carries a list of tags instead of a compound tag). Upgrade
+ of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
+
+
======================= Lucene 4.0.0-ALPHA =======================
More information about this release, including any errata related to the
diff --git a/lucene/analysis/morfologik/build.xml b/lucene/analysis/morfologik/build.xml
index d7e5c5aca71..d4455c7fc58 100644
--- a/lucene/analysis/morfologik/build.xml
+++ b/lucene/analysis/morfologik/build.xml
@@ -27,9 +27,9 @@
-
-
-
+
+
+
diff --git a/lucene/analysis/morfologik/ivy.xml b/lucene/analysis/morfologik/ivy.xml
index f35e1ae2679..c5fa7219474 100644
--- a/lucene/analysis/morfologik/ivy.xml
+++ b/lucene/analysis/morfologik/ivy.xml
@@ -19,9 +19,9 @@
-
-
-
+
+
+
diff --git a/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1
deleted file mode 100644
index e1828396c40..00000000000
--- a/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-1513ee81494d7856f607ff8fffc74b4c6cbe0d48
diff --git a/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
new file mode 100644
index 00000000000..3d3b86d5f8c
--- /dev/null
+++ b/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
@@ -0,0 +1 @@
+d1f729cd3019e6d86485226202f84458141a5688
diff --git a/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt b/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
index 2684a835b73..f97fb7dfe38 100644
--- a/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
@@ -1,6 +1,6 @@
Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
diff --git a/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1
deleted file mode 100644
index be196fa6ee8..00000000000
--- a/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-ee23a00580efe973aafa6f2c225e52951832901b
diff --git a/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
new file mode 100644
index 00000000000..6eb48a47896
--- /dev/null
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
@@ -0,0 +1 @@
+8217b6f7ad018ceda0e824b2e60340000da4397a
diff --git a/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt b/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt
similarity index 51%
rename from lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
rename to lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt
index 772cffe13d5..04ffd07ece9 100644
--- a/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt
@@ -1,13 +1,33 @@
-morfologik-polish, TERMS OF LICENCE
+BSD-licensed dictionary of Polish (Morfologik)
-This JAR contains and makes use of data from Polish ispell/myspell
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+Copyright (c) 2012, Marcin Miłkowski
+All rights reserved.
-Part-of-speech tags were added in Morfologik project and are not found
-in the data from sjp.pl.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
------
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--
BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/
@@ -39,4 +59,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt b/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
index f4d3c6cc7d5..a8a3aa11a3d 100644
--- a/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
@@ -1,9 +1,6 @@
-This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
-(http://morfologik.blogspot.com/).
-This JAR contains and makes use of data from Polish ispell/myspell
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+This product includes data from BSD-licensed dictionary of Polish (Morfologik)
+(http://morfologik.blogspot.com/)
This product includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)
diff --git a/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1
deleted file mode 100644
index 26d203ac0a6..00000000000
--- a/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-eba98b7cd049e07d55a64b180345954b62e42ec5
diff --git a/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1 b/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
new file mode 100644
index 00000000000..c31642be45d
--- /dev/null
+++ b/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
@@ -0,0 +1 @@
+c4ead57b78fa71b00553ff21da6fb5a326e914e8
diff --git a/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt b/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
index 2684a835b73..f97fb7dfe38 100644
--- a/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
+++ b/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
@@ -1,6 +1,6 @@
Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
index f7669f8446f..16ca41e5e53 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@@ -19,8 +19,7 @@ package org.apache.lucene.analysis.morfologik;
*/
import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
import morfologik.stemming.*;
import morfologik.stemming.PolishStemmer.DICTIONARY;
@@ -30,13 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.*;
/**
* {@link TokenFilter} using Morfologik library.
*
- * MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
+ * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
* annotations for produced lemmas. See the Morfologik documentation for details.
*
* @see Morfologik project page
@@ -44,7 +42,7 @@ import org.apache.lucene.util.Version;
public class MorfologikFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
+ private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final CharsRef scratch = new CharsRef(0);
@@ -55,6 +53,8 @@ public class MorfologikFilter extends TokenFilter {
private final IStemmer stemmer;
private List lemmaList;
+ private final ArrayList tagsList = new ArrayList();
+
private int lemmaListIndex;
/**
@@ -73,9 +73,43 @@ public class MorfologikFilter extends TokenFilter {
}
private void popNextLemma() {
- final WordData lemma = lemmaList.get(lemmaListIndex++);
- termAtt.setEmpty().append(lemma.getStem());
- tagAtt.setTag(lemma.getTag());
+ // Collect all tags for the next unique lemma.
+ CharSequence currentStem;
+ int tags = 0;
+ do {
+ final WordData lemma = lemmaList.get(lemmaListIndex++);
+ currentStem = lemma.getStem();
+ final CharSequence tag = lemma.getTag();
+ if (tag != null) {
+ if (tagsList.size() <= tags) {
+ tagsList.add(new StringBuilder());
+ }
+
+ final StringBuilder buffer = tagsList.get(tags++);
+ buffer.setLength(0);
+ buffer.append(lemma.getTag());
+ }
+ } while (lemmaListIndex < lemmaList.size() &&
+ equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
+
+ // Set the lemma's base form and tags as attributes.
+ termAtt.setEmpty().append(currentStem);
+ tagsAtt.setTags(tagsList.subList(0, tags));
+ }
+
+ /**
+ * Compare two char sequences for equality. Assumes non-null arguments.
+ */
+ private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
+ int len1 = s1.length();
+ int len2 = s2.length();
+ if (len1 != len2) return false;
+ for (int i = len1; --i >= 0;) {
+ if (s1.charAt(i) != s2.charAt(i)) {
+ return false;
+ }
+ }
+ return true;
}
/**
@@ -101,7 +135,7 @@ public class MorfologikFilter extends TokenFilter {
current = captureState();
popNextLemma();
} else {
- tagAtt.clear();
+ tagsAtt.clear();
}
return true;
} else {
@@ -130,6 +164,7 @@ public class MorfologikFilter extends TokenFilter {
public void reset() throws IOException {
lemmaListIndex = 0;
lemmaList = Collections.emptyList();
+ tagsList.clear();
super.reset();
}
}
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
deleted file mode 100644
index 602d71568bf..00000000000
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
+++ /dev/null
@@ -1,92 +0,0 @@
-// -*- c-basic-offset: 2 -*-
-package org.apache.lucene.analysis.morfologik;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.AttributeImpl;
-
-/**
- * Morphosyntactic annotations for surface forms.
- * @see MorphosyntacticTagAttribute
- */
-public class MorphosyntacticTagAttributeImpl extends AttributeImpl
- implements MorphosyntacticTagAttribute, Cloneable {
-
- /**
- * Either the original tag from WordData or a clone.
- */
- private CharSequence tag;
-
- /**
- * Set the tag.
- */
- public void setTag(CharSequence pos) {
- this.tag = ((pos == null || pos.length() == 0) ? null : pos);
- }
-
- /**
- * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
- * because it may change with each new term!
- */
- public CharSequence getTag() {
- return tag;
- }
-
- public void clear() {
- tag = null;
- }
-
- public boolean equals(Object other) {
- if (other instanceof MorphosyntacticTagAttribute) {
- return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
- }
- return false;
- }
-
- /**
- * Check if two char sequences are the same.
- */
- private boolean equal(CharSequence chs1, CharSequence chs2) {
- if (chs1 == null && chs2 == null)
- return true;
- if (chs1 == null || chs2 == null)
- return false;
- int l1 = chs1.length();
- int l2 = chs2.length();
- if (l1 != l2)
- return false;
- for (int i = 0; i < l1; i++)
- if (chs1.charAt(i) != chs2.charAt(i))
- return false;
- return true;
- }
-
- public int hashCode() {
- return this.tag == null ? 0 : tag.hashCode();
- }
-
- public void copyTo(AttributeImpl target) {
- ((MorphosyntacticTagAttribute) target).setTag(this.tag);
- }
-
- public MorphosyntacticTagAttributeImpl clone() {
- MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
- cloned.tag = (tag == null ? null : tag.toString());
- return cloned;
- }
-}
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
similarity index 80%
rename from lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
rename to lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
index a6887cd2ead..295148837b8 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.morfologik;
* limitations under the License.
*/
+import java.util.List;
+
import org.apache.lucene.util.Attribute;
/**
@@ -25,15 +27,18 @@ import org.apache.lucene.util.Attribute;
* surface forms. For the exact format and description of these,
* see the project's documentation (annotations vary by dictionary!).
*/
-public interface MorphosyntacticTagAttribute extends Attribute {
+public interface MorphosyntacticTagsAttribute extends Attribute {
/**
* Set the POS tag. The default value (no-value) is null.
- * @param pos POS tag corresponding to current lemma
+ *
+ * @param tags A list of POS tags corresponding to current lemma.
*/
- public void setTag(CharSequence pos);
+ public void setTags(List tags);
- /** Returns the POS tag of the term. */
- public CharSequence getTag();
+ /**
+ * Returns the list of POS tags for the current term.
+ */
+ public List getTags();
/** Clear to default value. */
public void clear();
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
new file mode 100644
index 00000000000..8e3a8c497bf
--- /dev/null
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
@@ -0,0 +1,96 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * Morphosyntactic annotations for surface forms.
+ * @see MorphosyntacticTagsAttribute
+ */
+public class MorphosyntacticTagsAttributeImpl extends AttributeImpl
+ implements MorphosyntacticTagsAttribute, Cloneable {
+
+ /**
+ * A list of potential tag variants for the current token.
+ */
+ private List tags;
+
+ /**
+ * Returns the list of POS tags of the term. If you need a copy of this list, copy
+ * its contents (and clone the {@link StringBuilder}s) because they change with
+ * each new term to avoid unnecessary memory allocations.
+ */
+ @Override
+ public List getTags() {
+ return tags;
+ }
+
+ public void clear() {
+ tags = null;
+ }
+
+ public boolean equals(Object other) {
+ if (other instanceof MorphosyntacticTagsAttribute) {
+ return equal(this.getTags(), ((MorphosyntacticTagsAttribute) other).getTags());
+ }
+ return false;
+ }
+
+ private boolean equal(Object l1, Object l2) {
+ return l1 == null ? (l2 == null) : (l1.equals(l2));
+ }
+
+ public int hashCode() {
+ return this.tags == null ? 0 : tags.hashCode();
+ }
+
+ /**
+ * Sets the internal tags reference to the given list. The contents
+ * of the list are not copied.
+ */
+ @Override
+ public void setTags(List tags) {
+ this.tags = tags;
+ }
+
+ public void copyTo(AttributeImpl target) {
+ List cloned = null;
+ if (tags != null) {
+ cloned = new ArrayList(tags.size());
+ for (StringBuilder b : tags) {
+ cloned.add(new StringBuilder(b));
+ }
+ }
+ ((MorphosyntacticTagsAttribute) target).setTags(cloned);
+ }
+
+ public MorphosyntacticTagsAttributeImpl clone() {
+ MorphosyntacticTagsAttributeImpl cloned = new MorphosyntacticTagsAttributeImpl();
+ this.copyTo(cloned);
+ return cloned;
+ }
+
+ @Override
+ public String toString() {
+ return tags == null ? "" : tags.toString();
+ }
+}
diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
index 88a45cf8590..8cf99435e77 100644
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@@ -1,4 +1,3 @@
-// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/*
@@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfologik;
import java.io.IOException;
import java.io.StringReader;
+import java.util.TreeSet;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
public final void testSingleTokens() throws IOException {
Analyzer a = getTestAnalyzer();
assertAnalyzesToReuse(a, "a", new String[] { "a" });
- assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
- assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+ assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+ assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
}
@@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesToReuse(
a,
"liście danych",
- new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
- new int[] { 0, 0, 0, 7, 7, 7 },
- new int[] { 6, 6, 6, 13, 13, 13 },
- new int[] { 1, 0, 0, 1, 0, 0 });
+ new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
+ new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
+ new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
+ new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
}
/** Test reuse of MorfologikFilter with leftover stems. */
@@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
ts_1.reset();
ts_1.incrementToken();
- assertEquals("first stream", "liść", termAtt_1.toString());
+ assertEquals("first stream", "liście", termAtt_1.toString());
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
@@ -76,33 +74,61 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
public final void testCase() throws IOException {
Analyzer a = getTestAnalyzer();
- assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
+ assertAnalyzesToReuse(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
- assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+ assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
- assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
+ assertAnalyzesToReuse(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
}
- private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+ private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
ts.incrementToken();
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
- assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+
+ TreeSet actual = new TreeSet();
+ TreeSet expected = new TreeSet();
+ for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
+ actual.add(b.toString());
+ }
+ for (String s : tags) {
+ expected.add(s);
+ }
+
+ if (!expected.equals(actual)) {
+ System.out.println("Expected:\n" + expected);
+ System.out.println("Actual:\n" + actual);
+ assertEquals(expected, actual);
+ }
}
/** Test morphosyntactic annotations. */
public final void testPOSAttribute() throws IOException {
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
- assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
- assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
- assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+ assertPOSToken(ts, "liście",
+ "subst:sg:acc:n2",
+ "subst:sg:nom:n2",
+ "subst:sg:voc:n2");
+
+ assertPOSToken(ts, "liść",
+ "subst:pl:acc:m3",
+ "subst:pl:nom:m3",
+ "subst:pl:voc:m3");
+
+ assertPOSToken(ts, "list",
+ "subst:sg:loc:m3",
+ "subst:sg:voc:m3");
+
+ assertPOSToken(ts, "lista",
+ "subst:sg:dat:f",
+ "subst:sg:loc:f");
}
-
+
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
diff --git a/lucene/module-build.xml b/lucene/module-build.xml
index 62cfd960aa7..771016af0ff 100644
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@@ -312,9 +312,7 @@
-
-
-
+
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
index d3d52e3cc76..30b3ba018c0 100644
--- a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
@@ -7,7 +7,6 @@ import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.solr.schema.IndexSchema;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more