mirror of https://github.com/apache/lucene.git
LUCENE-4138: Update morfologik (Polish stemming) to release 1.5.3. Changed the way morphosyntactic tags are exposed (a list of tags for a single lemma instead of a compound tag).
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354840 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
25dd0dd17b
commit
8127865e2d
|
@ -97,9 +97,9 @@
|
|||
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
||||
|
|
|
@ -303,7 +303,7 @@
|
|||
<dependency>
|
||||
<groupId>org.carrot2</groupId>
|
||||
<artifactId>morfologik-polish</artifactId>
|
||||
<version>1.5.2</version>
|
||||
<version>1.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.woodstox</groupId>
|
||||
|
|
|
@ -7,6 +7,16 @@ http://s.apache.org/luceneversions
|
|||
======================= Lucene 5.0.0 =======================
|
||||
|
||||
|
||||
======================= Lucene 4.0.0-BETA =======================
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
|
||||
The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
|
||||
has a different API (carries a list of tags instead of a compound tag). Upgrade
|
||||
of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
|
||||
|
||||
|
||||
======================= Lucene 4.0.0-ALPHA =======================
|
||||
|
||||
More information about this release, including any errata related to the
|
||||
|
|
|
@ -27,9 +27,9 @@
|
|||
|
||||
<path id="classpath">
|
||||
<pathelement path="${analyzers-common.jar}"/>
|
||||
<pathelement path="lib/morfologik-fsa-1.5.2.jar"/>
|
||||
<pathelement path="lib/morfologik-polish-1.5.2.jar"/>
|
||||
<pathelement path="lib/morfologik-stemming-1.5.2.jar"/>
|
||||
<pathelement path="lib/morfologik-fsa-1.5.3.jar"/>
|
||||
<pathelement path="lib/morfologik-polish-1.5.3.jar"/>
|
||||
<pathelement path="lib/morfologik-stemming-1.5.3.jar"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
|
|
|
@ -19,9 +19,9 @@
|
|||
<ivy-module version="2.0">
|
||||
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
||||
<dependencies>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.2" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.2" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.2" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
1513ee81494d7856f607ff8fffc74b4c6cbe0d48
|
|
@ -0,0 +1 @@
|
|||
d1f729cd3019e6d86485226202f84458141a5688
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
Copyright (c) 2006 Dawid Weiss
|
||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
ee23a00580efe973aafa6f2c225e52951832901b
|
|
@ -0,0 +1 @@
|
|||
8217b6f7ad018ceda0e824b2e60340000da4397a
|
|
@ -1,13 +1,33 @@
|
|||
morfologik-polish, TERMS OF LICENCE
|
||||
BSD-licensed dictionary of Polish (Morfologik)
|
||||
|
||||
This JAR contains and makes use of data from Polish ispell/myspell
|
||||
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
||||
licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
|
||||
Copyright (c) 2012, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Part-of-speech tags were added in Morfologik project and are not found
|
||||
in the data from sjp.pl.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
-----
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
--
|
||||
|
||||
BSD-licensed dictionary of Polish (SGJP)
|
||||
http://sgjp.pl/morfeusz/
|
|
@ -1,9 +1,6 @@
|
|||
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
||||
(http://morfologik.blogspot.com/).
|
||||
|
||||
This JAR contains and makes use of data from Polish ispell/myspell
|
||||
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
||||
licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
|
||||
This product includes data from BSD-licensed dictionary of Polish (Morfologik)
|
||||
(http://morfologik.blogspot.com/)
|
||||
|
||||
This product includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||
(http://sgjp.pl/morfeusz/)
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
eba98b7cd049e07d55a64b180345954b62e42ec5
|
|
@ -0,0 +1 @@
|
|||
c4ead57b78fa71b00553ff21da6fb5a326e914e8
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
Copyright (c) 2006 Dawid Weiss
|
||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
||||
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
|
|
@ -19,8 +19,7 @@ package org.apache.lucene.analysis.morfologik;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
import morfologik.stemming.*;
|
||||
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||
|
@ -30,13 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.*;
|
||||
|
||||
/**
|
||||
* {@link TokenFilter} using Morfologik library.
|
||||
*
|
||||
* MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
|
||||
* MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
|
||||
* annotations for produced lemmas. See the Morfologik documentation for details.
|
||||
*
|
||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||
|
@ -44,7 +42,7 @@ import org.apache.lucene.util.Version;
|
|||
public class MorfologikFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
|
||||
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
private final CharsRef scratch = new CharsRef(0);
|
||||
|
@ -55,6 +53,8 @@ public class MorfologikFilter extends TokenFilter {
|
|||
private final IStemmer stemmer;
|
||||
|
||||
private List<WordData> lemmaList;
|
||||
private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>();
|
||||
|
||||
private int lemmaListIndex;
|
||||
|
||||
/**
|
||||
|
@ -73,9 +73,43 @@ public class MorfologikFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
private void popNextLemma() {
|
||||
// Collect all tags for the next unique lemma.
|
||||
CharSequence currentStem;
|
||||
int tags = 0;
|
||||
do {
|
||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||
termAtt.setEmpty().append(lemma.getStem());
|
||||
tagAtt.setTag(lemma.getTag());
|
||||
currentStem = lemma.getStem();
|
||||
final CharSequence tag = lemma.getTag();
|
||||
if (tag != null) {
|
||||
if (tagsList.size() <= tags) {
|
||||
tagsList.add(new StringBuilder());
|
||||
}
|
||||
|
||||
final StringBuilder buffer = tagsList.get(tags++);
|
||||
buffer.setLength(0);
|
||||
buffer.append(lemma.getTag());
|
||||
}
|
||||
} while (lemmaListIndex < lemmaList.size() &&
|
||||
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
|
||||
|
||||
// Set the lemma's base form and tags as attributes.
|
||||
termAtt.setEmpty().append(currentStem);
|
||||
tagsAtt.setTags(tagsList.subList(0, tags));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two char sequences for equality. Assumes non-null arguments.
|
||||
*/
|
||||
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
|
||||
int len1 = s1.length();
|
||||
int len2 = s2.length();
|
||||
if (len1 != len2) return false;
|
||||
for (int i = len1; --i >= 0;) {
|
||||
if (s1.charAt(i) != s2.charAt(i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -101,7 +135,7 @@ public class MorfologikFilter extends TokenFilter {
|
|||
current = captureState();
|
||||
popNextLemma();
|
||||
} else {
|
||||
tagAtt.clear();
|
||||
tagsAtt.clear();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
|
@ -130,6 +164,7 @@ public class MorfologikFilter extends TokenFilter {
|
|||
public void reset() throws IOException {
|
||||
lemmaListIndex = 0;
|
||||
lemmaList = Collections.emptyList();
|
||||
tagsList.clear();
|
||||
super.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,92 +0,0 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/**
|
||||
* Morphosyntactic annotations for surface forms.
|
||||
* @see MorphosyntacticTagAttribute
|
||||
*/
|
||||
public class MorphosyntacticTagAttributeImpl extends AttributeImpl
|
||||
implements MorphosyntacticTagAttribute, Cloneable {
|
||||
|
||||
/**
|
||||
* Either the original tag from WordData or a clone.
|
||||
*/
|
||||
private CharSequence tag;
|
||||
|
||||
/**
|
||||
* Set the tag.
|
||||
*/
|
||||
public void setTag(CharSequence pos) {
|
||||
this.tag = ((pos == null || pos.length() == 0) ? null : pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the POS tag of the term. If you need a copy of this char sequence, clone it
|
||||
* because it may change with each new term!
|
||||
*/
|
||||
public CharSequence getTag() {
|
||||
return tag;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
tag = null;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other instanceof MorphosyntacticTagAttribute) {
|
||||
return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if two char sequences are the same.
|
||||
*/
|
||||
private boolean equal(CharSequence chs1, CharSequence chs2) {
|
||||
if (chs1 == null && chs2 == null)
|
||||
return true;
|
||||
if (chs1 == null || chs2 == null)
|
||||
return false;
|
||||
int l1 = chs1.length();
|
||||
int l2 = chs2.length();
|
||||
if (l1 != l2)
|
||||
return false;
|
||||
for (int i = 0; i < l1; i++)
|
||||
if (chs1.charAt(i) != chs2.charAt(i))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.tag == null ? 0 : tag.hashCode();
|
||||
}
|
||||
|
||||
public void copyTo(AttributeImpl target) {
|
||||
((MorphosyntacticTagAttribute) target).setTag(this.tag);
|
||||
}
|
||||
|
||||
public MorphosyntacticTagAttributeImpl clone() {
|
||||
MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
|
||||
cloned.tag = (tag == null ? null : tag.toString());
|
||||
return cloned;
|
||||
}
|
||||
}
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.morfologik;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
|
@ -25,15 +27,18 @@ import org.apache.lucene.util.Attribute;
|
|||
* surface forms. For the exact format and description of these,
|
||||
* see the project's documentation (annotations vary by dictionary!).
|
||||
*/
|
||||
public interface MorphosyntacticTagAttribute extends Attribute {
|
||||
public interface MorphosyntacticTagsAttribute extends Attribute {
|
||||
/**
|
||||
* Set the POS tag. The default value (no-value) is null.
|
||||
* @param pos POS tag corresponding to current lemma
|
||||
*
|
||||
* @param tags A list of POS tags corresponding to the current lemma.
|
||||
*/
|
||||
public void setTag(CharSequence pos);
|
||||
public void setTags(List<StringBuilder> tags);
|
||||
|
||||
/** Returns the POS tag of the term. */
|
||||
public CharSequence getTag();
|
||||
/**
|
||||
* Returns the list of POS tags of the term.
|
||||
*/
|
||||
public List<StringBuilder> getTags();
|
||||
|
||||
/** Clear to default value. */
|
||||
public void clear();
|
|
@ -0,0 +1,96 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/**
|
||||
* Morphosyntactic annotations for surface forms.
|
||||
* @see MorphosyntacticTagsAttribute
|
||||
*/
|
||||
public class MorphosyntacticTagsAttributeImpl extends AttributeImpl
|
||||
implements MorphosyntacticTagsAttribute, Cloneable {
|
||||
|
||||
/**
|
||||
* A list of potential tag variants for the current token.
|
||||
*/
|
||||
private List<StringBuilder> tags;
|
||||
|
||||
/**
|
||||
* Returns the POS tags of the term. If you need a copy of this list, copy
|
||||
* its contents (and clone {@link StringBuilder}s) because it changes with
|
||||
* each new term to avoid unnecessary memory allocations.
|
||||
*/
|
||||
@Override
|
||||
public List<StringBuilder> getTags() {
|
||||
return tags;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
tags = null;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other instanceof MorphosyntacticTagsAttribute) {
|
||||
return equal(this.getTags(), ((MorphosyntacticTagsAttribute) other).getTags());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean equal(Object l1, Object l2) {
|
||||
return l1 == null ? (l2 == null) : (l1.equals(l2));
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.tags == null ? 0 : tags.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the internal tags reference to the given list. The contents
|
||||
* are not copied.
|
||||
*/
|
||||
@Override
|
||||
public void setTags(List<StringBuilder> tags) {
|
||||
this.tags = tags;
|
||||
}
|
||||
|
||||
public void copyTo(AttributeImpl target) {
|
||||
List<StringBuilder> cloned = null;
|
||||
if (tags != null) {
|
||||
cloned = new ArrayList<StringBuilder>(tags.size());
|
||||
for (StringBuilder b : tags) {
|
||||
cloned.add(new StringBuilder(b));
|
||||
}
|
||||
}
|
||||
((MorphosyntacticTagsAttribute) target).setTags(cloned);
|
||||
}
|
||||
|
||||
public MorphosyntacticTagsAttributeImpl clone() {
|
||||
MorphosyntacticTagsAttributeImpl cloned = new MorphosyntacticTagsAttributeImpl();
|
||||
this.copyTo(cloned);
|
||||
return cloned;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return tags == null ? "<no tags>" : tags.toString();
|
||||
}
|
||||
}
|
|
@ -1,4 +1,3 @@
|
|||
// -*- c-basic-offset: 2 -*-
|
||||
package org.apache.lucene.analysis.morfologik;
|
||||
|
||||
/*
|
||||
|
@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfologik;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
|
@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
public final void testSingleTokens() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
assertAnalyzesToReuse(a, "a", new String[] { "a" });
|
||||
assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
|
||||
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
|
||||
assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
|
||||
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
|
||||
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
|
||||
}
|
||||
|
||||
|
@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesToReuse(
|
||||
a,
|
||||
"liście danych",
|
||||
new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
|
||||
new int[] { 0, 0, 0, 7, 7, 7 },
|
||||
new int[] { 6, 6, 6, 13, 13, 13 },
|
||||
new int[] { 1, 0, 0, 1, 0, 0 });
|
||||
new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
|
||||
new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
|
||||
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
|
||||
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
|
||||
}
|
||||
|
||||
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||
|
@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||
ts_1.reset();
|
||||
ts_1.incrementToken();
|
||||
assertEquals("first stream", "liść", termAtt_1.toString());
|
||||
assertEquals("first stream", "liście", termAtt_1.toString());
|
||||
|
||||
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
|
||||
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||
|
@ -76,31 +74,59 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
public final void testCase() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
|
||||
assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
|
||||
assertAnalyzesToReuse(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
|
||||
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
|
||||
assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
|
||||
assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
|
||||
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
|
||||
|
||||
assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
|
||||
assertAnalyzesToReuse(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
|
||||
}
|
||||
|
||||
private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
|
||||
private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
|
||||
ts.incrementToken();
|
||||
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
|
||||
assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
|
||||
|
||||
TreeSet<String> actual = new TreeSet<String>();
|
||||
TreeSet<String> expected = new TreeSet<String>();
|
||||
for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
|
||||
actual.add(b.toString());
|
||||
}
|
||||
for (String s : tags) {
|
||||
expected.add(s);
|
||||
}
|
||||
|
||||
if (!expected.equals(actual)) {
|
||||
System.out.println("Expected:\n" + expected);
|
||||
System.out.println("Actual:\n" + actual);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
}
|
||||
|
||||
/** Test morphosyntactic annotations. */
|
||||
public final void testPOSAttribute() throws IOException {
|
||||
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
|
||||
|
||||
assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
|
||||
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
|
||||
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
|
||||
assertPOSToken(ts, "liście",
|
||||
"subst:sg:acc:n2",
|
||||
"subst:sg:nom:n2",
|
||||
"subst:sg:voc:n2");
|
||||
|
||||
assertPOSToken(ts, "liść",
|
||||
"subst:pl:acc:m3",
|
||||
"subst:pl:nom:m3",
|
||||
"subst:pl:voc:m3");
|
||||
|
||||
assertPOSToken(ts, "list",
|
||||
"subst:sg:loc:m3",
|
||||
"subst:sg:voc:m3");
|
||||
|
||||
assertPOSToken(ts, "lista",
|
||||
"subst:sg:dat:f",
|
||||
"subst:sg:loc:f");
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
|
|
|
@ -312,9 +312,7 @@
|
|||
<property name="analyzers-morfologik.jar" value="${common.dir}/build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
|
||||
<fileset id="analyzers-morfologik.fileset" dir="${common.dir}">
|
||||
<include name="build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar" />
|
||||
<include name="analysis/morfologik/lib/morfologik-fsa-1.5.2.jar" />
|
||||
<include name="analysis/morfologik/lib/morfologik-polish-1.5.2.jar" />
|
||||
<include name="analysis/morfologik/lib/morfologik-stemming-1.5.2.jar" />
|
||||
<include name="analysis/morfologik/lib/morfologik-*.jar" />
|
||||
</fileset>
|
||||
<target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
|
||||
<module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>
|
||||
|
|
|
@ -7,7 +7,6 @@ import java.util.Map;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
|
Loading…
Reference in New Issue