lemmaList;
+ private int lemmaListIndex;
+
+ /**
+  * Creates a lemmatizing filter on top of {@code in}.
+  *
+  * @param in token stream supplying the surface forms to look up
+  * @param dict dictionary selector handed to the {@link PolishStemmer} constructor
+  * @param version Lucene compatibility version used to obtain the {@link CharacterUtils}
+  *        instance that lowercasing relies on
+  */
+ public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
+   super(in);
+   // No lemmas are pending until the first successful dictionary lookup.
+   lemmaList = Collections.emptyList();
+   input = in;
+   stemmer = new PolishStemmer(dict);
+   charUtils = CharacterUtils.getInstance(version);
+ }
+
+ /** Publishes the lemma under the cursor as the current term and tag, then advances the cursor. */
+ private void popNextLemma() {
+   final WordData entry = lemmaList.get(lemmaListIndex++);
+   termAtt.setEmpty();
+   termAtt.append(entry.getStem());
+   tagAtt.setTag(entry.getTag());
+ }
+
+ /**
+  * Queries the stemmer for {@code token}, stores the result in {@link #lemmaList}
+  * and rewinds {@link #lemmaListIndex} to the first entry.
+  *
+  * @param token surface form to look up
+  * @return {@code true} if the lookup produced at least one lemma
+  */
+ private boolean lookupSurfaceForm(CharSequence token) {
+   lemmaList = stemmer.lookup(token);
+   lemmaListIndex = 0;
+   return !lemmaList.isEmpty();
+ }
+
+ /**
+  * Emits the next token: either a pending lemma of the previously read surface form
+  * or the first result for the next token pulled from the input stream.
+  *
+  * @return {@code false} once the input stream is exhausted and no lemma is pending
+  */
+ @Override
+ public final boolean incrementToken() throws IOException {
+   final boolean lemmasPending = lemmaListIndex < lemmaList.size();
+   if (lemmasPending) {
+     // Remaining lemmas reuse the captured state of their surface form and are
+     // stacked on the same position (increment of zero).
+     restoreState(current);
+     posIncrAtt.setPositionIncrement(0);
+     popNextLemma();
+     return true;
+   }
+
+   if (!this.input.incrementToken()) {
+     return false;
+   }
+
+   // Try the token verbatim first and fall back to its lowercased form.
+   final boolean found = lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt));
+   if (found) {
+     current = captureState();
+     popNextLemma();
+   } else {
+     // Unknown word: the term is left as read from the input, only the tag is cleared.
+     tagAtt.clear();
+   }
+   return true;
+ }
+
+ /**
+  * Lowercases {@code chs} code point by code point, writing the result into the
+  * reusable {@code scratch} buffer rather than into the argument.
+  *
+  * @param chs character sequence to lowercase
+  * @return the shared {@code scratch} buffer holding the lowercased characters
+  */
+ private CharSequence toLowercase(CharSequence chs) {
+   final int n = chs.length();
+   scratch.length = n;
+   scratch.grow(n);
+
+   final char[] out = scratch.chars;
+   int pos = 0;
+   while (pos < n) {
+     final int lowered = Character.toLowerCase(charUtils.codePointAt(chs, pos));
+     // toChars reports how many chars it wrote; that count also advances the read position.
+     pos += Character.toChars(lowered, out, pos);
+   }
+   return scratch;
+ }
+
+ /** Drops any lemmas still pending from earlier use, then delegates to the superclass reset. */
+ @Override
+ public void reset() throws IOException {
+   lemmaList = Collections.emptyList();
+   lemmaListIndex = 0;
+   super.reset();
+ }
+}
diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
new file mode 100644
index 00000000000..a1950eef43e
--- /dev/null
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
@@ -0,0 +1,40 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Morfologik dictionaries provide morphosyntactic annotations for
+ * surface forms. For the exact format and description of these,
+ * see the project's documentation (annotations vary by dictionary!).
+ */
+public interface MorphosyntacticTagAttribute extends Attribute {
+ /**
+ * Set the POS tag. The default value (no-value) is null.
+ * @param pos POS tag corresponding to current lemma
+ */
+ public void setTag(CharSequence pos);
+
+ /** Returns the POS tag of the term. */
+ public CharSequence getTag();
+
+ /** Clear to default value. */
+ public void clear();
+}
diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
new file mode 100644
index 00000000000..11ff3d5fd36
--- /dev/null
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
@@ -0,0 +1,91 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * @see MorphosyntacticTagAttribute
+ */
+public class MorphosyntacticTagAttributeImpl extends AttributeImpl
+ implements MorphosyntacticTagAttribute, Cloneable {
+
+ /**
+ * Either the original tag from WordData or a clone.
+ */
+ private CharSequence tag;
+
+ /**
+ * Set the tag.
+ */
+ public void setTag(CharSequence pos) {
+ this.tag = ((pos == null || pos.length() == 0) ? null : pos);
+ }
+
+ /**
+ * Returns the POS tag of the term. If you need a copy of this char sequence, clone it
+ * because it may change with each new term!
+ */
+ public CharSequence getTag() {
+ return tag;
+ }
+
+ public void clear() {
+ tag = null;
+ }
+
+ public boolean equals(Object other) {
+ if (other instanceof MorphosyntacticTagAttribute) {
+ return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
+ }
+ return false;
+ }
+
+ /**
+ * Check if two char sequences are the same.
+ */
+ private boolean equal(CharSequence chs1, CharSequence chs2) {
+ if (chs1 == null && chs2 == null)
+ return true;
+ if (chs1 == null || chs2 == null)
+ return false;
+ int l1 = chs1.length();
+ int l2 = chs2.length();
+ if (l1 != l2)
+ return false;
+ for (int i = 0; i < l1; i++)
+ if (chs1.charAt(i) != chs2.charAt(i))
+ return false;
+ return true;
+ }
+
+ public int hashCode() {
+ return this.tag == null ? 0 : tag.hashCode();
+ }
+
+ public void copyTo(AttributeImpl target) {
+ ((MorphosyntacticTagAttribute) target).setTag(this.tag);
+ }
+
+ public Object clone() {
+ MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
+ cloned.tag = (tag == null ? null : tag.toString());
+ return cloned;
+ }
+}
diff --git a/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
new file mode 100644
index 00000000000..6b67d0e4c34
--- /dev/null
+++ b/modules/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/package.html
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+ This package provides a dictionary-driven lemmatization ("accurate stemming")
+ filter and analyzer for the Polish language, driven by the
+ Morfologik library developed
+ by Dawid Weiss and Marcin Miłkowski.
+
+
+ The MorfologikFilter yields one or more terms for each token. Each
+ of those terms is given the same position in the index.
+
+
+
diff --git a/modules/analysis/morfologik/src/java/overview.html b/modules/analysis/morfologik/src/java/overview.html
new file mode 100644
index 00000000000..6b67d0e4c34
--- /dev/null
+++ b/modules/analysis/morfologik/src/java/overview.html
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+ This package provides a dictionary-driven lemmatization ("accurate stemming")
+ filter and analyzer for the Polish language, driven by the
+ Morfologik library developed
+ by Dawid Weiss and Marcin Miłkowski.
+
+
+ The MorfologikFilter yields one or more terms for each token. Each
+ of those terms is given the same position in the index.
+
+
+
diff --git a/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
new file mode 100644
index 00000000000..42154ba08ca
--- /dev/null
+++ b/modules/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
@@ -0,0 +1,105 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
+ */
+public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
+
+ private Analyzer getTestAnalyzer() {
+ return new MorfologikAnalyzer(TEST_VERSION_CURRENT);
+ }
+
+ /** Test stemming of single tokens with Morfologik library. */
+ public final void testSingleTokens() throws IOException {
+ Analyzer a = getTestAnalyzer();
+ assertAnalyzesToReuse(a, "a", new String[] { "a" });
+ assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
+ assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+ assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
+ }
+
+ /** Test stemming of multiple tokens and proper term metrics. */
+ public final void testMultipleTokens() throws IOException {
+ Analyzer a = getTestAnalyzer();
+ assertAnalyzesToReuse(
+ a,
+ "liście danych",
+ new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
+ new int[] { 0, 0, 0, 7, 7, 7 },
+ new int[] { 6, 6, 6, 13, 13, 13 },
+ new int[] { 1, 0, 0, 1, 0, 0 });
+ }
+
+ /** Test reuse of MorfologikFilter with leftover stems. */
+ public final void testLeftoverStems() throws IOException {
+ Analyzer a = getTestAnalyzer();
+ TokenStream ts_1 = a.reusableTokenStream("dummy", new StringReader("liście"));
+ CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
+ ts_1.reset();
+ ts_1.incrementToken();
+ assertEquals("first stream", "liść", termAtt_1.toString());
+
+ TokenStream ts_2 = a.reusableTokenStream("dummy", new StringReader("danych"));
+ CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
+ ts_2.reset();
+ ts_2.incrementToken();
+ assertEquals("second stream", "dany", termAtt_2.toString());
+ }
+
+ /** Test stemming of mixed-case tokens. */
+ public final void testCase() throws IOException {
+ Analyzer a = getTestAnalyzer();
+
+ assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
+ assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
+
+ assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
+ assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+
+ assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
+ assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
+
+ assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
+ }
+
+ private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+ ts.incrementToken();
+ assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
+ assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+ }
+
+ /** Test morphosyntactic annotations. */
+ public final void testPOSAttribute() throws IOException {
+ TokenStream ts = getTestAnalyzer().reusableTokenStream("dummy", new StringReader("liście"));
+
+ assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
+ assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
+ assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+ }
+}