mirror of https://github.com/apache/lucene.git
LUCENE-2062: Bulgarian Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@886190 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bdf44dce94
commit
892bc7f55a
|
@ -20,6 +20,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
|||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The Bulgarian analyzer (contrib/analyzers) comes with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||
|
||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||
|
|
|
@ -15,6 +15,8 @@ New features
|
|||
* LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
|
||||
when Version is set to 3.1 or higher. (Robert Muir)
|
||||
|
||||
* LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer)
|
||||
|
||||
|
||||
======================= Release 3.0.0 2009-11-25 =======================
|
||||
|
||||
|
|
|
@ -0,0 +1,176 @@
|
|||
package org.apache.lucene.analysis.bg;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Bulgarian.
|
||||
* <p>
|
||||
* This analyzer implements light-stemming as specified by: <i> Searching
|
||||
* Strategies for the Bulgarian Language </i>
|
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
* <p>
|
||||
*/
|
||||
public final class BulgarianAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* File containing default Bulgarian stopwords.
|
||||
*
|
||||
* Default stopword list is from
|
||||
* http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
|
||||
* BSD-Licensed.
|
||||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this
|
||||
* will be ignored
|
||||
*/
|
||||
public static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<String> getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
|
||||
* class accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<String> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||
} catch (Exception ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set", ex);
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||
final InputStream stream = BulgarianAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||
STOPWORDS_COMMENT));
|
||||
} finally {
|
||||
if(stream != null)
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public BulgarianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
super();
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||
stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
result = new BulgarianStemFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||
* text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||
streams.result = new BulgarianStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package org.apache.lucene.analysis.bg;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
|
||||
* words.
|
||||
*/
|
||||
public final class BulgarianStemFilter extends TokenFilter {
|
||||
private final BulgarianStemmer stemmer;
|
||||
private final TermAttribute termAtt;
|
||||
|
||||
public BulgarianStemFilter(final TokenStream input) {
|
||||
super(input);
|
||||
stemmer = new BulgarianStemmer();
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
|
||||
termAtt.setTermLength(newlen);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,152 @@
|
|||
package org.apache.lucene.analysis.bg;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Light Stemmer for Bulgarian.
|
||||
* <p>
|
||||
* Implements the algorithm described in:
|
||||
* <i>
|
||||
* Searching Strategies for the Bulgarian Language
|
||||
* </i>
|
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
*/
|
||||
public class BulgarianStemmer {

  /**
   * Two-letter article endings stripped from words longer than five
   * characters, in the order they are tried.
   */
  private static final String[] TWO_LETTER_ARTICLES = {
      "ът", "то", "те", "та", "ия"
  };

  /**
   * Stem an input buffer of Bulgarian text. The buffer is rewritten in place
   * where a rule replaces a letter; the caller truncates it to the returned
   * length.
   * 
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(final char s[], int len) {
    // words shorter than four letters are never stemmed
    if (len < 4) {
      return len;
    }

    if (len > 5 && endsWith(s, len, "ища")) {
      return len - 3;
    }

    // article first, then plural, each working on the previous result
    len = removePlural(s, removeArticle(s, len));

    if (len > 3) {
      if (s[len - 1] == 'я') {
        len--;
      }
      // checked on the (possibly just shortened) word
      final char last = s[len - 1];
      if (last == 'а' || last == 'о' || last == 'е') {
        len--;
      }
    }

    // the rule to rewrite ен -> н is duplicated in the paper.
    // in the perl implementation referenced by the paper, this is fixed.
    // (it is fixed here as well)
    if (len > 4 && endsWith(s, len, "ен")) {
      s[len - 2] = 'н';
      len--;
    }

    // ъN -> N: drop the ъ by moving the last letter over it
    if (len > 5 && s[len - 2] == 'ъ') {
      s[len - 2] = s[len - 1];
      len--;
    }

    return len;
  }

  /**
   * Mainly remove the definite article
   * 
   * @param s input buffer
   * @param len length of input buffer
   * @return new stemmed length
   */
  private int removeArticle(final char s[], final int len) {
    if (len > 6 && endsWith(s, len, "ият")) {
      return len - 3;
    }

    if (len > 5) {
      for (final String article : TWO_LETTER_ARTICLES) {
        if (endsWith(s, len, article)) {
          return len - 2;
        }
      }
    }

    if (len > 4 && endsWith(s, len, "ят")) {
      return len - 2;
    }

    return len;
  }

  /**
   * Remove a plural ending, rewriting the stem-final letter where the rule
   * calls for it.
   * 
   * @param s input buffer
   * @param len length of input buffer
   * @return new stemmed length
   */
  private int removePlural(final char s[], final int len) {
    if (len > 6) {
      if (endsWith(s, len, "овци")) {
        return len - 3; // drops вци, the о stays
      }
      if (endsWith(s, len, "ове")) {
        return len - 3;
      }
      if (endsWith(s, len, "еве")) {
        s[len - 3] = 'й'; // еве -> й
        return len - 2;
      }
    }

    if (len > 5) {
      if (endsWith(s, len, "ища")) {
        return len - 3;
      }
      if (endsWith(s, len, "та")) {
        return len - 2;
      }
    }

    // every remaining rule removes a final и
    if (len > 4 && s[len - 1] == 'и') {
      final int beforeLast = len - 2;
      if (len > 5) {
        if (s[beforeLast] == 'ц') {
          s[beforeLast] = 'к'; // ци -> к
          return len - 1;
        }
        if (s[beforeLast] == 'з') {
          s[beforeLast] = 'г'; // зи -> г
          return len - 1;
        }
        if (s[len - 3] == 'е') {
          s[len - 3] = 'я'; // е*и -> я*
          return len - 1;
        }
      }
      if (s[beforeLast] == 'с') {
        s[beforeLast] = 'х'; // си -> х
      }
      return len - 1;
    }

    return len;
  }

  /**
   * Tells whether the first <code>len</code> characters of <code>s</code>
   * end with <code>suffix</code>. Characters are compared from the end of
   * the word backwards.
   */
  private boolean endsWith(final char s[], final int len, final String suffix) {
    final int start = len - suffix.length();
    if (start < 0) {
      return false;
    }

    int i = suffix.length();
    while (--i >= 0) {
      if (s[start + i] != suffix.charAt(i)) {
        return false;
      }
    }
    return true;
  }
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Bulgarian.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,193 @@
|
|||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
а
|
||||
аз
|
||||
ако
|
||||
ала
|
||||
бе
|
||||
без
|
||||
беше
|
||||
би
|
||||
бил
|
||||
била
|
||||
били
|
||||
било
|
||||
близо
|
||||
бъдат
|
||||
бъде
|
||||
бяха
|
||||
в
|
||||
вас
|
||||
ваш
|
||||
ваша
|
||||
вероятно
|
||||
вече
|
||||
взема
|
||||
ви
|
||||
вие
|
||||
винаги
|
||||
все
|
||||
всеки
|
||||
всички
|
||||
всичко
|
||||
всяка
|
||||
във
|
||||
въпреки
|
||||
върху
|
||||
г
|
||||
ги
|
||||
главно
|
||||
го
|
||||
д
|
||||
да
|
||||
дали
|
||||
до
|
||||
докато
|
||||
докога
|
||||
дори
|
||||
досега
|
||||
доста
|
||||
е
|
||||
едва
|
||||
един
|
||||
ето
|
||||
за
|
||||
зад
|
||||
заедно
|
||||
заради
|
||||
засега
|
||||
затова
|
||||
защо
|
||||
защото
|
||||
и
|
||||
из
|
||||
или
|
||||
им
|
||||
има
|
||||
имат
|
||||
иска
|
||||
й
|
||||
каза
|
||||
как
|
||||
каква
|
||||
какво
|
||||
както
|
||||
какъв
|
||||
като
|
||||
кога
|
||||
когато
|
||||
което
|
||||
които
|
||||
кой
|
||||
който
|
||||
колко
|
||||
която
|
||||
къде
|
||||
където
|
||||
към
|
||||
ли
|
||||
м
|
||||
ме
|
||||
между
|
||||
мен
|
||||
ми
|
||||
мнозина
|
||||
мога
|
||||
могат
|
||||
може
|
||||
моля
|
||||
момента
|
||||
му
|
||||
н
|
||||
на
|
||||
над
|
||||
назад
|
||||
най
|
||||
направи
|
||||
напред
|
||||
например
|
||||
нас
|
||||
не
|
||||
него
|
||||
нея
|
||||
ни
|
||||
ние
|
||||
никой
|
||||
нито
|
||||
но
|
||||
някои
|
||||
някой
|
||||
няма
|
||||
обаче
|
||||
около
|
||||
освен
|
||||
особено
|
||||
от
|
||||
отгоре
|
||||
отново
|
||||
още
|
||||
пак
|
||||
по
|
||||
повече
|
||||
повечето
|
||||
под
|
||||
поне
|
||||
поради
|
||||
после
|
||||
почти
|
||||
прави
|
||||
пред
|
||||
преди
|
||||
през
|
||||
при
|
||||
пък
|
||||
първо
|
||||
с
|
||||
са
|
||||
само
|
||||
се
|
||||
сега
|
||||
си
|
||||
скоро
|
||||
след
|
||||
сме
|
||||
според
|
||||
сред
|
||||
срещу
|
||||
сте
|
||||
съм
|
||||
със
|
||||
също
|
||||
т
|
||||
тази
|
||||
така
|
||||
такива
|
||||
такъв
|
||||
там
|
||||
твой
|
||||
те
|
||||
тези
|
||||
ти
|
||||
тн
|
||||
то
|
||||
това
|
||||
тогава
|
||||
този
|
||||
той
|
||||
толкова
|
||||
точно
|
||||
трябва
|
||||
тук
|
||||
тъй
|
||||
тя
|
||||
тях
|
||||
у
|
||||
харесва
|
||||
ч
|
||||
че
|
||||
често
|
||||
чрез
|
||||
ще
|
||||
щом
|
||||
я
|
|
@ -0,0 +1,70 @@
|
|||
package org.apache.lucene.analysis.bg;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test the Bulgarian analyzer
|
||||
*/
|
||||
public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
/**
|
||||
* This test fails with NPE when the stopwords file is missing in classpath
|
||||
*/
|
||||
public void testResourcesAvailable() {
|
||||
new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
}
|
||||
|
||||
public void testStopwords() throws IOException {
|
||||
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"});
|
||||
}
|
||||
|
||||
public void testCustomStopwords() throws IOException {
|
||||
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections
|
||||
.emptySet());
|
||||
assertAnalyzesTo(a, "Как се казваш?",
|
||||
new String[] {"как", "се", "казваш"});
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws IOException {
|
||||
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
|
||||
assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some examples from the paper
|
||||
*/
|
||||
public void testBasicExamples() throws IOException {
|
||||
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"});
|
||||
assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"});
|
||||
|
||||
assertAnalyzesTo(a, "компютри", new String[] {"компютр"});
|
||||
assertAnalyzesTo(a, "компютър", new String[] {"компютр"});
|
||||
|
||||
assertAnalyzesTo(a, "градове", new String[] {"град"});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,210 @@
|
|||
package org.apache.lucene.analysis.bg;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test the Bulgarian Stemmer
|
||||
*/
|
||||
public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test showing how masculine noun forms conflate. An example noun for each
|
||||
* common (and some rare) plural pattern is listed.
|
||||
*/
|
||||
public void testMasculineNouns() throws IOException {
|
||||
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
|
||||
// -и pattern
|
||||
assertAnalyzesTo(a, "град", new String[] {"град"});
|
||||
assertAnalyzesTo(a, "града", new String[] {"град"});
|
||||
assertAnalyzesTo(a, "градът", new String[] {"град"});
|
||||
assertAnalyzesTo(a, "градове", new String[] {"град"});
|
||||
assertAnalyzesTo(a, "градовете", new String[] {"град"});
|
||||
|
||||
// -ове pattern
|
||||
assertAnalyzesTo(a, "народ", new String[] {"народ"});
|
||||
assertAnalyzesTo(a, "народа", new String[] {"народ"});
|
||||
assertAnalyzesTo(a, "народът", new String[] {"народ"});
|
||||
assertAnalyzesTo(a, "народи", new String[] {"народ"});
|
||||
assertAnalyzesTo(a, "народите", new String[] {"народ"});
|
||||
assertAnalyzesTo(a, "народе", new String[] {"народ"});
|
||||
|
||||
// -ища pattern
|
||||
assertAnalyzesTo(a, "път", new String[] {"път"});
|
||||
assertAnalyzesTo(a, "пътя", new String[] {"път"});
|
||||
assertAnalyzesTo(a, "пътят", new String[] {"път"});
|
||||
assertAnalyzesTo(a, "пътища", new String[] {"път"});
|
||||
assertAnalyzesTo(a, "пътищата", new String[] {"път"});
|
||||
|
||||
// -чета pattern
|
||||
assertAnalyzesTo(a, "градец", new String[] {"градец"});
|
||||
assertAnalyzesTo(a, "градеца", new String[] {"градец"});
|
||||
assertAnalyzesTo(a, "градецът", new String[] {"градец"});
|
||||
/* note the below forms conflate with each other, but not the rest */
|
||||
assertAnalyzesTo(a, "градовце", new String[] {"градовц"});
|
||||
assertAnalyzesTo(a, "градовцете", new String[] {"градовц"});
|
||||
|
||||
// -овци pattern
|
||||
assertAnalyzesTo(a, "дядо", new String[] {"дяд"});
|
||||
assertAnalyzesTo(a, "дядото", new String[] {"дяд"});
|
||||
assertAnalyzesTo(a, "дядовци", new String[] {"дяд"});
|
||||
assertAnalyzesTo(a, "дядовците", new String[] {"дяд"});
|
||||
|
||||
// -е pattern
|
||||
assertAnalyzesTo(a, "мъж", new String[] {"мъж"});
|
||||
assertAnalyzesTo(a, "мъжа", new String[] {"мъж"});
|
||||
assertAnalyzesTo(a, "мъже", new String[] {"мъж"});
|
||||
assertAnalyzesTo(a, "мъжете", new String[] {"мъж"});
|
||||
assertAnalyzesTo(a, "мъжо", new String[] {"мъж"});
|
||||
/* word is too short, will not remove -ът */
|
||||
assertAnalyzesTo(a, "мъжът", new String[] {"мъжът"});
|
||||
|
||||
// -а pattern
|
||||
assertAnalyzesTo(a, "крак", new String[] {"крак"});
|
||||
assertAnalyzesTo(a, "крака", new String[] {"крак"});
|
||||
assertAnalyzesTo(a, "кракът", new String[] {"крак"});
|
||||
assertAnalyzesTo(a, "краката", new String[] {"крак"});
|
||||
|
||||
// брат
|
||||
assertAnalyzesTo(a, "брат", new String[] {"брат"});
|
||||
assertAnalyzesTo(a, "брата", new String[] {"брат"});
|
||||
assertAnalyzesTo(a, "братът", new String[] {"брат"});
|
||||
assertAnalyzesTo(a, "братя", new String[] {"брат"});
|
||||
assertAnalyzesTo(a, "братята", new String[] {"брат"});
|
||||
assertAnalyzesTo(a, "брате", new String[] {"брат"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test showing how feminine noun forms conflate
|
||||
*/
|
||||
public void testFeminineNouns() throws IOException {
|
||||
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
|
||||
assertAnalyzesTo(a, "вест", new String[] {"вест"});
|
||||
assertAnalyzesTo(a, "вестта", new String[] {"вест"});
|
||||
assertAnalyzesTo(a, "вести", new String[] {"вест"});
|
||||
assertAnalyzesTo(a, "вестите", new String[] {"вест"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test showing how neuter noun forms conflate an example noun for each common
|
||||
* plural pattern is listed
|
||||
*/
|
||||
public void testNeuterNouns() throws IOException {
|
||||
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
|
||||
// -а pattern
|
||||
assertAnalyzesTo(a, "дърво", new String[] {"дърв"});
|
||||
assertAnalyzesTo(a, "дървото", new String[] {"дърв"});
|
||||
assertAnalyzesTo(a, "дърва", new String[] {"дърв"});
|
||||
assertAnalyzesTo(a, "дървета", new String[] {"дърв"});
|
||||
assertAnalyzesTo(a, "дървата", new String[] {"дърв"});
|
||||
assertAnalyzesTo(a, "дърветата", new String[] {"дърв"});
|
||||
|
||||
// -та pattern
|
||||
assertAnalyzesTo(a, "море", new String[] {"мор"});
|
||||
assertAnalyzesTo(a, "морето", new String[] {"мор"});
|
||||
assertAnalyzesTo(a, "морета", new String[] {"мор"});
|
||||
assertAnalyzesTo(a, "моретата", new String[] {"мор"});
|
||||
|
||||
// -я pattern
|
||||
assertAnalyzesTo(a, "изключение", new String[] {"изключени"});
|
||||
assertAnalyzesTo(a, "изключението", new String[] {"изключени"});
|
||||
assertAnalyzesTo(a, "изключенията", new String[] {"изключени"});
|
||||
/* note the below form in this example does not conflate with the rest */
|
||||
assertAnalyzesTo(a, "изключения", new String[] {"изключн"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test showing how adjectival forms conflate
|
||||
*/
|
||||
public void testAdjectives() throws IOException {
|
||||
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
assertAnalyzesTo(a, "красив", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красивия", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красивият", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красива", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красивата", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красиво", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красивото", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красиви", new String[] {"красив"});
|
||||
assertAnalyzesTo(a, "красивите", new String[] {"красив"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some exceptional rules, implemented as rewrites.
|
||||
*/
|
||||
public void testExceptions() throws IOException {
|
||||
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||
|
||||
// ци -> к
|
||||
assertAnalyzesTo(a, "собственик", new String[] {"собственик"});
|
||||
assertAnalyzesTo(a, "собственика", new String[] {"собственик"});
|
||||
assertAnalyzesTo(a, "собственикът", new String[] {"собственик"});
|
||||
assertAnalyzesTo(a, "собственици", new String[] {"собственик"});
|
||||
assertAnalyzesTo(a, "собствениците", new String[] {"собственик"});
|
||||
|
||||
// зи -> г
|
||||
assertAnalyzesTo(a, "подлог", new String[] {"подлог"});
|
||||
assertAnalyzesTo(a, "подлога", new String[] {"подлог"});
|
||||
assertAnalyzesTo(a, "подлогът", new String[] {"подлог"});
|
||||
assertAnalyzesTo(a, "подлози", new String[] {"подлог"});
|
||||
assertAnalyzesTo(a, "подлозите", new String[] {"подлог"});
|
||||
|
||||
// си -> х
|
||||
assertAnalyzesTo(a, "кожух", new String[] {"кожух"});
|
||||
assertAnalyzesTo(a, "кожуха", new String[] {"кожух"});
|
||||
assertAnalyzesTo(a, "кожухът", new String[] {"кожух"});
|
||||
assertAnalyzesTo(a, "кожуси", new String[] {"кожух"});
|
||||
assertAnalyzesTo(a, "кожусите", new String[] {"кожух"});
|
||||
|
||||
// ъ deletion
|
||||
assertAnalyzesTo(a, "център", new String[] {"центр"});
|
||||
assertAnalyzesTo(a, "центъра", new String[] {"центр"});
|
||||
assertAnalyzesTo(a, "центърът", new String[] {"центр"});
|
||||
assertAnalyzesTo(a, "центрове", new String[] {"центр"});
|
||||
assertAnalyzesTo(a, "центровете", new String[] {"центр"});
|
||||
|
||||
// е*и -> я*
|
||||
assertAnalyzesTo(a, "промяна", new String[] {"промян"});
|
||||
assertAnalyzesTo(a, "промяната", new String[] {"промян"});
|
||||
assertAnalyzesTo(a, "промени", new String[] {"промян"});
|
||||
assertAnalyzesTo(a, "промените", new String[] {"промян"});
|
||||
|
||||
// ен -> н
|
||||
assertAnalyzesTo(a, "песен", new String[] {"песн"});
|
||||
assertAnalyzesTo(a, "песента", new String[] {"песн"});
|
||||
assertAnalyzesTo(a, "песни", new String[] {"песн"});
|
||||
assertAnalyzesTo(a, "песните", new String[] {"песн"});
|
||||
|
||||
// -еве -> й
|
||||
// note: this is the only word i think this rule works for.
|
||||
// most -еве pluralized nouns are monosyllabic,
|
||||
// and the stemmer requires length > 6...
|
||||
assertAnalyzesTo(a, "строй", new String[] {"строй"});
|
||||
assertAnalyzesTo(a, "строеве", new String[] {"строй"});
|
||||
assertAnalyzesTo(a, "строевете", new String[] {"строй"});
|
||||
/* note the below forms conflate with each other, but not the rest */
|
||||
assertAnalyzesTo(a, "строя", new String[] {"стр"});
|
||||
assertAnalyzesTo(a, "строят", new String[] {"стр"});
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue