mirror of https://github.com/apache/lucene.git
LUCENE-4138: Update morfologik (polish stemming) to release 1.5.3. Changed the way morphosyntactic tags are exposed (a list of tags for a single lemma instead of a compound tag).
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354840 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
25dd0dd17b
commit
8127865e2d
|
@ -97,9 +97,9 @@
|
||||||
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
|
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
|
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
|
<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
|
||||||
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
|
<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
|
||||||
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
||||||
|
|
|
@ -303,7 +303,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.carrot2</groupId>
|
<groupId>org.carrot2</groupId>
|
||||||
<artifactId>morfologik-polish</artifactId>
|
<artifactId>morfologik-polish</artifactId>
|
||||||
<version>1.5.2</version>
|
<version>1.5.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.codehaus.woodstox</groupId>
|
<groupId>org.codehaus.woodstox</groupId>
|
||||||
|
|
|
@ -7,6 +7,16 @@ http://s.apache.org/luceneversions
|
||||||
======================= Lucene 5.0.0 =======================
|
======================= Lucene 5.0.0 =======================
|
||||||
|
|
||||||
|
|
||||||
|
======================= Lucene 4.0.0-BETA =======================
|
||||||
|
|
||||||
|
API Changes
|
||||||
|
|
||||||
|
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
|
||||||
|
The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
|
||||||
|
has a different API (carries a list of tags instead of a compound tag). Upgrade
|
||||||
|
of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
|
||||||
|
|
||||||
|
|
||||||
======================= Lucene 4.0.0-ALPHA =======================
|
======================= Lucene 4.0.0-ALPHA =======================
|
||||||
|
|
||||||
More information about this release, including any errata related to the
|
More information about this release, including any errata related to the
|
||||||
|
|
|
@ -27,9 +27,9 @@
|
||||||
|
|
||||||
<path id="classpath">
|
<path id="classpath">
|
||||||
<pathelement path="${analyzers-common.jar}"/>
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
<pathelement path="lib/morfologik-fsa-1.5.2.jar"/>
|
<pathelement path="lib/morfologik-fsa-1.5.3.jar"/>
|
||||||
<pathelement path="lib/morfologik-polish-1.5.2.jar"/>
|
<pathelement path="lib/morfologik-polish-1.5.3.jar"/>
|
||||||
<pathelement path="lib/morfologik-stemming-1.5.2.jar"/>
|
<pathelement path="lib/morfologik-stemming-1.5.3.jar"/>
|
||||||
<path refid="base.classpath"/>
|
<path refid="base.classpath"/>
|
||||||
</path>
|
</path>
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,9 @@
|
||||||
<ivy-module version="2.0">
|
<ivy-module version="2.0">
|
||||||
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.2" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.2" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.2" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
|
||||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
1513ee81494d7856f607ff8fffc74b4c6cbe0d48
|
|
|
@ -0,0 +1 @@
|
||||||
|
d1f729cd3019e6d86485226202f84458141a5688
|
|
@ -1,6 +1,6 @@
|
||||||
|
|
||||||
Copyright (c) 2006 Dawid Weiss
|
Copyright (c) 2006 Dawid Weiss
|
||||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without modification,
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
ee23a00580efe973aafa6f2c225e52951832901b
|
|
|
@ -0,0 +1 @@
|
||||||
|
8217b6f7ad018ceda0e824b2e60340000da4397a
|
|
@ -1,13 +1,33 @@
|
||||||
morfologik-polish, TERMS OF LICENCE
|
BSD-licensed dictionary of Polish (Morfologik)
|
||||||
|
|
||||||
This JAR contains and makes use of data from Polish ispell/myspell
|
Copyright (c) 2012, Marcin Miłkowski
|
||||||
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
All rights reserved.
|
||||||
licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
|
|
||||||
|
|
||||||
Part-of-speech tags were added in Morfologik project and are not found
|
Redistribution and use in source and binary forms, with or without
|
||||||
in the data from sjp.pl.
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
-----
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
|
||||||
|
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
|
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
--
|
||||||
|
|
||||||
BSD-licensed dictionary of Polish (SGJP)
|
BSD-licensed dictionary of Polish (SGJP)
|
||||||
http://sgjp.pl/morfeusz/
|
http://sgjp.pl/morfeusz/
|
||||||
|
@ -39,4 +59,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,9 +1,6 @@
|
||||||
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
|
|
||||||
(http://morfologik.blogspot.com/).
|
|
||||||
|
|
||||||
This JAR contains and makes use of data from Polish ispell/myspell
|
This product includes data from BSD-licensed dictionary of Polish (Morfologik)
|
||||||
dictionaries hosted at http://www.sjp.pl/slownik/en/ and is
|
(http://morfologik.blogspot.com/)
|
||||||
licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
|
|
||||||
|
|
||||||
This product includes data from BSD-licensed dictionary of Polish (SGJP)
|
This product includes data from BSD-licensed dictionary of Polish (SGJP)
|
||||||
(http://sgjp.pl/morfeusz/)
|
(http://sgjp.pl/morfeusz/)
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
eba98b7cd049e07d55a64b180345954b62e42ec5
|
|
|
@ -0,0 +1 @@
|
||||||
|
c4ead57b78fa71b00553ff21da6fb5a326e914e8
|
|
@ -1,6 +1,6 @@
|
||||||
|
|
||||||
Copyright (c) 2006 Dawid Weiss
|
Copyright (c) 2006 Dawid Weiss
|
||||||
Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
|
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without modification,
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
|
|
@ -19,8 +19,7 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import morfologik.stemming.*;
|
import morfologik.stemming.*;
|
||||||
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||||
|
@ -30,13 +29,12 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.*;
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link TokenFilter} using Morfologik library.
|
* {@link TokenFilter} using Morfologik library.
|
||||||
*
|
*
|
||||||
* MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
|
* MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
|
||||||
* annotations for produced lemmas. See the Morfologik documentation for details.
|
* annotations for produced lemmas. See the Morfologik documentation for details.
|
||||||
*
|
*
|
||||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||||
|
@ -44,7 +42,7 @@ import org.apache.lucene.util.Version;
|
||||||
public class MorfologikFilter extends TokenFilter {
|
public class MorfologikFilter extends TokenFilter {
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
|
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
private final CharsRef scratch = new CharsRef(0);
|
private final CharsRef scratch = new CharsRef(0);
|
||||||
|
@ -55,6 +53,8 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
private final IStemmer stemmer;
|
private final IStemmer stemmer;
|
||||||
|
|
||||||
private List<WordData> lemmaList;
|
private List<WordData> lemmaList;
|
||||||
|
private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>();
|
||||||
|
|
||||||
private int lemmaListIndex;
|
private int lemmaListIndex;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -73,9 +73,43 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void popNextLemma() {
|
private void popNextLemma() {
|
||||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
// Collect all tags for the next unique lemma.
|
||||||
termAtt.setEmpty().append(lemma.getStem());
|
CharSequence currentStem;
|
||||||
tagAtt.setTag(lemma.getTag());
|
int tags = 0;
|
||||||
|
do {
|
||||||
|
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||||
|
currentStem = lemma.getStem();
|
||||||
|
final CharSequence tag = lemma.getTag();
|
||||||
|
if (tag != null) {
|
||||||
|
if (tagsList.size() <= tags) {
|
||||||
|
tagsList.add(new StringBuilder());
|
||||||
|
}
|
||||||
|
|
||||||
|
final StringBuilder buffer = tagsList.get(tags++);
|
||||||
|
buffer.setLength(0);
|
||||||
|
buffer.append(lemma.getTag());
|
||||||
|
}
|
||||||
|
} while (lemmaListIndex < lemmaList.size() &&
|
||||||
|
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
|
||||||
|
|
||||||
|
// Set the lemma's base form and tags as attributes.
|
||||||
|
termAtt.setEmpty().append(currentStem);
|
||||||
|
tagsAtt.setTags(tagsList.subList(0, tags));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare two char sequences for equality. Assumes non-null arguments.
|
||||||
|
*/
|
||||||
|
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
|
||||||
|
int len1 = s1.length();
|
||||||
|
int len2 = s2.length();
|
||||||
|
if (len1 != len2) return false;
|
||||||
|
for (int i = len1; --i >= 0;) {
|
||||||
|
if (s1.charAt(i) != s2.charAt(i)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -101,7 +135,7 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
current = captureState();
|
current = captureState();
|
||||||
popNextLemma();
|
popNextLemma();
|
||||||
} else {
|
} else {
|
||||||
tagAtt.clear();
|
tagsAtt.clear();
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -130,6 +164,7 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
lemmaListIndex = 0;
|
lemmaListIndex = 0;
|
||||||
lemmaList = Collections.emptyList();
|
lemmaList = Collections.emptyList();
|
||||||
|
tagsList.clear();
|
||||||
super.reset();
|
super.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,92 +0,0 @@
|
||||||
// -*- c-basic-offset: 2 -*-
|
|
||||||
package org.apache.lucene.analysis.morfologik;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import org.apache.lucene.util.AttributeImpl;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Morphosyntactic annotations for surface forms.
|
|
||||||
* @see MorphosyntacticTagAttribute
|
|
||||||
*/
|
|
||||||
public class MorphosyntacticTagAttributeImpl extends AttributeImpl
|
|
||||||
implements MorphosyntacticTagAttribute, Cloneable {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Either the original tag from WordData or a clone.
|
|
||||||
*/
|
|
||||||
private CharSequence tag;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set the tag.
|
|
||||||
*/
|
|
||||||
public void setTag(CharSequence pos) {
|
|
||||||
this.tag = ((pos == null || pos.length() == 0) ? null : pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the POS tag of the term. If you need a copy of this char sequence, clone it
|
|
||||||
* because it may change with each new term!
|
|
||||||
*/
|
|
||||||
public CharSequence getTag() {
|
|
||||||
return tag;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
tag = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean equals(Object other) {
|
|
||||||
if (other instanceof MorphosyntacticTagAttribute) {
|
|
||||||
return equal(this.getTag(), ((MorphosyntacticTagAttribute) other).getTag());
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if two char sequences are the same.
|
|
||||||
*/
|
|
||||||
private boolean equal(CharSequence chs1, CharSequence chs2) {
|
|
||||||
if (chs1 == null && chs2 == null)
|
|
||||||
return true;
|
|
||||||
if (chs1 == null || chs2 == null)
|
|
||||||
return false;
|
|
||||||
int l1 = chs1.length();
|
|
||||||
int l2 = chs2.length();
|
|
||||||
if (l1 != l2)
|
|
||||||
return false;
|
|
||||||
for (int i = 0; i < l1; i++)
|
|
||||||
if (chs1.charAt(i) != chs2.charAt(i))
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int hashCode() {
|
|
||||||
return this.tag == null ? 0 : tag.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void copyTo(AttributeImpl target) {
|
|
||||||
((MorphosyntacticTagAttribute) target).setTag(this.tag);
|
|
||||||
}
|
|
||||||
|
|
||||||
public MorphosyntacticTagAttributeImpl clone() {
|
|
||||||
MorphosyntacticTagAttributeImpl cloned = new MorphosyntacticTagAttributeImpl();
|
|
||||||
cloned.tag = (tag == null ? null : tag.toString());
|
|
||||||
return cloned;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.util.Attribute;
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -25,15 +27,18 @@ import org.apache.lucene.util.Attribute;
|
||||||
* surface forms. For the exact format and description of these,
|
* surface forms. For the exact format and description of these,
|
||||||
* see the project's documentation (annotations vary by dictionary!).
|
* see the project's documentation (annotations vary by dictionary!).
|
||||||
*/
|
*/
|
||||||
public interface MorphosyntacticTagAttribute extends Attribute {
|
public interface MorphosyntacticTagsAttribute extends Attribute {
|
||||||
/**
|
/**
|
||||||
* Set the POS tag. The default value (no-value) is null.
|
* Set the POS tags. The default value (no-value) is null.
|
||||||
* @param pos POS tag corresponding to current lemma
|
*
|
||||||
|
* @param tags A list of POS tags corresponding to current lemma.
|
||||||
*/
|
*/
|
||||||
public void setTag(CharSequence pos);
|
public void setTags(List<StringBuilder> tags);
|
||||||
|
|
||||||
/** Returns the POS tag of the term. */
|
/**
|
||||||
public CharSequence getTag();
|
* Returns the POS tags of the term.
|
||||||
|
*/
|
||||||
|
public List<StringBuilder> getTags();
|
||||||
|
|
||||||
/** Clear to default value. */
|
/** Clear to default value. */
|
||||||
public void clear();
|
public void clear();
|
|
@ -0,0 +1,96 @@
|
||||||
|
// -*- c-basic-offset: 2 -*-
|
||||||
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.AttributeImpl;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Morphosyntactic annotations for surface forms.
|
||||||
|
* @see MorphosyntacticTagsAttribute
|
||||||
|
*/
|
||||||
|
public class MorphosyntacticTagsAttributeImpl extends AttributeImpl
|
||||||
|
implements MorphosyntacticTagsAttribute, Cloneable {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A list of potential tag variants for the current token.
|
||||||
|
*/
|
||||||
|
private List<StringBuilder> tags;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the POS tag of the term. If you need a copy of this char sequence, copy
|
||||||
|
* its contents (and clone {@link StringBuilder}s) because it changes with
|
||||||
|
* each new term to avoid unnecessary memory allocations.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public List<StringBuilder> getTags() {
|
||||||
|
return tags;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
tags = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other instanceof MorphosyntacticTagsAttribute) {
|
||||||
|
return equal(this.getTags(), ((MorphosyntacticTagsAttribute) other).getTags());
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean equal(Object l1, Object l2) {
|
||||||
|
return l1 == null ? (l2 == null) : (l1.equals(l2));
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return this.tags == null ? 0 : tags.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the internal tags reference to the given list. The contents
|
||||||
|
* is not copied.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void setTags(List<StringBuilder> tags) {
|
||||||
|
this.tags = tags;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(AttributeImpl target) {
|
||||||
|
List<StringBuilder> cloned = null;
|
||||||
|
if (tags != null) {
|
||||||
|
cloned = new ArrayList<StringBuilder>(tags.size());
|
||||||
|
for (StringBuilder b : tags) {
|
||||||
|
cloned.add(new StringBuilder(b));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
((MorphosyntacticTagsAttribute) target).setTags(cloned);
|
||||||
|
}
|
||||||
|
|
||||||
|
public MorphosyntacticTagsAttributeImpl clone() {
|
||||||
|
MorphosyntacticTagsAttributeImpl cloned = new MorphosyntacticTagsAttributeImpl();
|
||||||
|
this.copyTo(cloned);
|
||||||
|
return cloned;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return tags == null ? "<no tags>" : tags.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,4 +1,3 @@
|
||||||
// -*- c-basic-offset: 2 -*-
|
|
||||||
package org.apache.lucene.analysis.morfologik;
|
package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
public final void testSingleTokens() throws IOException {
|
public final void testSingleTokens() throws IOException {
|
||||||
Analyzer a = getTestAnalyzer();
|
Analyzer a = getTestAnalyzer();
|
||||||
assertAnalyzesToReuse(a, "a", new String[] { "a" });
|
assertAnalyzesToReuse(a, "a", new String[] { "a" });
|
||||||
assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
|
assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
|
||||||
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
|
assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
|
||||||
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
|
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesToReuse(
|
assertAnalyzesToReuse(
|
||||||
a,
|
a,
|
||||||
"liście danych",
|
"liście danych",
|
||||||
new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
|
new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
|
||||||
new int[] { 0, 0, 0, 7, 7, 7 },
|
new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
|
||||||
new int[] { 6, 6, 6, 13, 13, 13 },
|
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
|
||||||
new int[] { 1, 0, 0, 1, 0, 0 });
|
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Test reuse of MorfologikFilter with leftover stems. */
|
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||||
|
@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||||
ts_1.reset();
|
ts_1.reset();
|
||||||
ts_1.incrementToken();
|
ts_1.incrementToken();
|
||||||
assertEquals("first stream", "liść", termAtt_1.toString());
|
assertEquals("first stream", "liście", termAtt_1.toString());
|
||||||
|
|
||||||
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
|
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
|
||||||
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||||
|
@ -76,33 +74,61 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
public final void testCase() throws IOException {
|
public final void testCase() throws IOException {
|
||||||
Analyzer a = getTestAnalyzer();
|
Analyzer a = getTestAnalyzer();
|
||||||
|
|
||||||
assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
|
assertAnalyzesToReuse(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
|
||||||
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
|
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
|
||||||
|
|
||||||
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
|
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
|
||||||
assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
|
assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
|
||||||
|
|
||||||
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
|
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
|
||||||
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
|
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
|
||||||
|
|
||||||
assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
|
assertAnalyzesToReuse(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
|
private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
|
||||||
ts.incrementToken();
|
ts.incrementToken();
|
||||||
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
|
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
|
||||||
assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
|
|
||||||
|
TreeSet<String> actual = new TreeSet<String>();
|
||||||
|
TreeSet<String> expected = new TreeSet<String>();
|
||||||
|
for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
|
||||||
|
actual.add(b.toString());
|
||||||
|
}
|
||||||
|
for (String s : tags) {
|
||||||
|
expected.add(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!expected.equals(actual)) {
|
||||||
|
System.out.println("Expected:\n" + expected);
|
||||||
|
System.out.println("Actual:\n" + actual);
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Test morphosyntactic annotations. */
|
/** Test morphosyntactic annotations. */
|
||||||
public final void testPOSAttribute() throws IOException {
|
public final void testPOSAttribute() throws IOException {
|
||||||
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
|
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
|
||||||
|
|
||||||
assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
|
assertPOSToken(ts, "liście",
|
||||||
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
|
"subst:sg:acc:n2",
|
||||||
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
|
"subst:sg:nom:n2",
|
||||||
|
"subst:sg:voc:n2");
|
||||||
|
|
||||||
|
assertPOSToken(ts, "liść",
|
||||||
|
"subst:pl:acc:m3",
|
||||||
|
"subst:pl:nom:m3",
|
||||||
|
"subst:pl:voc:m3");
|
||||||
|
|
||||||
|
assertPOSToken(ts, "list",
|
||||||
|
"subst:sg:loc:m3",
|
||||||
|
"subst:sg:voc:m3");
|
||||||
|
|
||||||
|
assertPOSToken(ts, "lista",
|
||||||
|
"subst:sg:dat:f",
|
||||||
|
"subst:sg:loc:f");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandom() throws Exception {
|
public void testRandom() throws Exception {
|
||||||
checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
|
checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
|
||||||
|
|
|
@ -312,9 +312,7 @@
|
||||||
<property name="analyzers-morfologik.jar" value="${common.dir}/build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
|
<property name="analyzers-morfologik.jar" value="${common.dir}/build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
|
||||||
<fileset id="analyzers-morfologik.fileset" dir="${common.dir}">
|
<fileset id="analyzers-morfologik.fileset" dir="${common.dir}">
|
||||||
<include name="build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar" />
|
<include name="build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar" />
|
||||||
<include name="analysis/morfologik/lib/morfologik-fsa-1.5.2.jar" />
|
<include name="analysis/morfologik/lib/morfologik-*.jar" />
|
||||||
<include name="analysis/morfologik/lib/morfologik-polish-1.5.2.jar" />
|
|
||||||
<include name="analysis/morfologik/lib/morfologik-stemming-1.5.2.jar" />
|
|
||||||
</fileset>
|
</fileset>
|
||||||
<target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
|
<target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
|
||||||
<module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>
|
<module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>
|
||||||
|
|
|
@ -7,7 +7,6 @@ import java.util.Map;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.solr.schema.IndexSchema;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
|
Loading…
Reference in New Issue