LUCENE-2062: Bulgarian Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@886190 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-12-02 16:08:56 +00:00
parent bdf44dce94
commit 892bc7f55a
9 changed files with 880 additions and 0 deletions

View File

@ -20,6 +20,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The Bulgarian analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
Includes lib/servlet-api-2.4.jar from Apache Tomcat Includes lib/servlet-api-2.4.jar from Apache Tomcat
The SmartChineseAnalyzer source code (under contrib/analyzers) was The SmartChineseAnalyzer source code (under contrib/analyzers) was

View File

@ -15,6 +15,8 @@ New features
* LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
when Version is set to 3.1 or higher. (Robert Muir) when Version is set to 3.1 or higher. (Robert Muir)
* LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer)
======================= Release 3.0.0 2009-11-25 ======================= ======================= Release 3.0.0 2009-11-25 =======================

View File

@ -0,0 +1,176 @@
package org.apache.lucene.analysis.bg;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
 * {@link Analyzer} for Bulgarian.
 * <p>
 * The analysis chain is a {@link StandardTokenizer} followed by
 * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} and
 * {@link BulgarianStemFilter}.
 * <p>
 * This analyzer implements light-stemming as specified by: <i> Searching
 * Strategies for the Bulgarian Language </i>
 * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
 */
public final class BulgarianAnalyzer extends Analyzer {

  /**
   * File containing default Bulgarian stopwords.
   *
   * Default stopword list is from
   * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
   * BSD-Licensed.
   */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * The comment character in the stopwords file. All lines prefixed with this
   * will be ignored.
   */
  public static final String STOPWORDS_COMMENT = "#";

  /**
   * Contains the stopwords used with the StopFilter.
   */
  private final Set<?> stoptable;

  /**
   * Version passed to the tokenizer and to the version-aware filters.
   */
  private final Version matchVersion;

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   *
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static Set<String> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
   * class accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<String> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadDefaultStopWordSet();
      } catch (Exception ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }

    /**
     * Reads {@link #DEFAULT_STOPWORD_FILE} (resolved relative to
     * {@link BulgarianAnalyzer}) as UTF-8 and returns its words.
     *
     * @return an unmodifiable set of the words in the default stopword file
     * @throws IOException if the resource cannot be found or read
     */
    static Set<String> loadDefaultStopWordSet() throws IOException {
      final InputStream stream = BulgarianAnalyzer.class
          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
      if (stream == null) {
        // getResourceAsStream signals a missing resource with null; report it
        // by name instead of letting InputStreamReader fail with a bare NPE
        throw new IOException("Default stopword file not found on classpath: "
            + DEFAULT_STOPWORD_FILE);
      }
      try {
        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
        // make sure it is unmodifiable as we expose it in the outer class
        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
            STOPWORDS_COMMENT));
      } finally {
        stream.close();
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words:
   * {@link #DEFAULT_STOPWORD_FILE}.
   *
   * @param matchVersion version passed on to the tokenizer and filters
   */
  public BulgarianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion version passed on to the tokenizer and filters
   * @param stopwords words to drop from the token stream; the set is copied,
   *        so later changes to it do not affect this analyzer
   */
  public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
    super();
    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
        stopwords));
    this.matchVersion = matchVersion;
  }

  /**
   * Wraps a token source in the filters shared by {@link #tokenStream} and
   * {@link #reusableTokenStream}, so both paths always build the same chain.
   *
   * @param source the tokenizer output to filter
   * @return the source filtered with {@link StandardFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter}, and
   *         {@link BulgarianStemFilter}, in that order
   */
  private TokenStream buildFilterChain(TokenStream source) {
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stoptable);
    result = new BulgarianStemFilter(result);
    return result;
  }

  /**
   * Creates a {@link TokenStream} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return buildFilterChain(new StandardTokenizer(matchVersion, reader));
  }

  /**
   * Holder for the tokenizer and the end of its filter chain, kept between
   * calls to {@link #reusableTokenStream}. Declared static so a saved instance
   * does not carry a hidden reference to the analyzer that created it.
   */
  private static final class SavedStreams {
    Tokenizer source;
    TokenStream result;
  }

  /**
   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
   * text in the provided {@link Reader}.
   *
   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
   * @throws IOException if resetting the saved tokenizer on the new reader
   *         fails
   */
  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      // first use: build the chain once and remember it for later calls
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(matchVersion, reader);
      streams.result = buildFilterChain(streams.source);
      setPreviousTokenStream(streams);
    } else {
      // chain already exists: only point the tokenizer at the new input
      streams.source.reset(reader);
    }
    return streams.result;
  }
}

View File

@ -0,0 +1,50 @@
package org.apache.lucene.analysis.bg;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
 * A {@link TokenFilter} that runs every token of the wrapped stream through a
 * {@link BulgarianStemmer}, shortening the term in place.
 */
public final class BulgarianStemFilter extends TokenFilter {
  private final BulgarianStemmer stemmer;
  private final TermAttribute termAtt;

  /**
   * Creates a stemming filter over the given stream.
   *
   * @param input the token stream whose terms are to be stemmed
   */
  public BulgarianStemFilter(final TokenStream input) {
    super(input);
    stemmer = new BulgarianStemmer();
    termAtt = addAttribute(TermAttribute.class);
  }

  /**
   * Advances the wrapped stream by one token and stems its term.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // the stemmer edits the term buffer directly and reports the new length
    final char[] buffer = termAtt.termBuffer();
    final int stemmedLength = stemmer.stem(buffer, termAtt.termLength());
    termAtt.setTermLength(stemmedLength);
    return true;
  }
}

View File

@ -0,0 +1,152 @@
package org.apache.lucene.analysis.bg;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Light Stemmer for Bulgarian.
 * <p>
 * Implements the algorithm described in:
 * <i>
 * Searching Strategies for the Bulgarian Language
 * </i>
 * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
 * <p>
 * All methods work on a character buffer plus a logical length: suffixes are
 * dropped by returning a smaller length, and a few rules overwrite characters
 * of the buffer in place.
 */
public class BulgarianStemmer {

  /** Two-letter article endings that are stripped from words longer than 5. */
  private static final String[] TWO_LETTER_ARTICLES = {
    "ът", "то", "те", "та", "ия"
  };

  /**
   * Stem an input buffer of Bulgarian text.
   *
   * @param s input buffer; may be modified in place
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(final char s[], int len) {
    if (len < 4) {
      return len; // too short, do not stem
    }

    if (len > 5 && endsWith(s, len, "ища")) {
      return len - 3;
    }

    // article first, then plural, each working on the already shortened word
    len = removePlural(s, removeArticle(s, len));

    if (len > 3) {
      if (endsWith(s, len, "я")) {
        len--;
      }
      final boolean trailingVowel = endsWith(s, len, "а")
          || endsWith(s, len, "о")
          || endsWith(s, len, "е");
      if (trailingVowel) {
        len--;
      }
    }

    // the rule to rewrite ен -> н is duplicated in the paper.
    // in the perl implementation referenced by the paper, this is fixed.
    // (it is fixed here as well)
    if (len > 4 && endsWith(s, len, "ен")) {
      s[len - 2] = 'н'; // replace with н
      len--;
    }

    if (len > 5 && s[len - 2] == 'ъ') {
      s[len - 2] = s[len - 1]; // replace ъN with N
      len--;
    }

    return len;
  }

  /**
   * Mainly remove the definite article.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new stemmed length
   */
  private int removeArticle(final char s[], final int len) {
    if (len > 6 && endsWith(s, len, "ият")) {
      return len - 3;
    }

    if (len > 5) {
      for (int i = 0; i < TWO_LETTER_ARTICLES.length; i++) {
        if (endsWith(s, len, TWO_LETTER_ARTICLES[i])) {
          return len - 2;
        }
      }
    }

    if (len > 4 && endsWith(s, len, "ят")) {
      return len - 2;
    }

    return len;
  }

  /**
   * Remove a plural ending, rewriting the stem's final consonant or vowel
   * where the rule calls for it.
   *
   * @param s input buffer; may be modified in place
   * @param len length of input buffer
   * @return new stemmed length
   */
  private int removePlural(final char s[], final int len) {
    if (len > 6) {
      // овци keeps its leading о, ове is dropped entirely
      if (endsWith(s, len, "овци") || endsWith(s, len, "ове")) {
        return len - 3;
      }
      if (endsWith(s, len, "еве")) {
        s[len - 3] = 'й'; // replace with й
        return len - 2;
      }
    }

    if (len > 5) {
      if (endsWith(s, len, "ища")) {
        return len - 3;
      }
      if (endsWith(s, len, "та")) {
        return len - 2;
      }
      if (endsWith(s, len, "ци")) {
        return restoreConsonant(s, len, 'к'); // replace with к
      }
      if (endsWith(s, len, "зи")) {
        return restoreConsonant(s, len, 'г'); // replace with г
      }
      if (s[len - 3] == 'е' && s[len - 1] == 'и') {
        s[len - 3] = 'я'; // replace е with я, remove и
        return len - 1;
      }
    }

    if (len > 4) {
      if (endsWith(s, len, "си")) {
        return restoreConsonant(s, len, 'х'); // replace with х
      }
      if (endsWith(s, len, "и")) {
        return len - 1;
      }
    }

    return len;
  }

  /**
   * Overwrites the second-to-last character with the given consonant and
   * drops the last character.
   *
   * @param s input buffer; modified in place
   * @param len length of input buffer
   * @param consonant character to store at position {@code len - 2}
   * @return {@code len - 1}
   */
  private int restoreConsonant(final char s[], final int len,
      final char consonant) {
    s[len - 2] = consonant;
    return len - 1;
  }

  /**
   * Tests whether the first {@code len} characters of the buffer end with the
   * given suffix.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to look for
   * @return {@code true} if the suffix is no longer than {@code len} and
   *         matches the end of the buffer
   */
  private boolean endsWith(final char s[], final int len, final String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len) {
      return false;
    }
    // compare from the last character backwards
    for (int back = 1; back <= suffixLen; back++) {
      if (s[len - back] != suffix.charAt(suffixLen - back)) {
        return false;
      }
    }
    return true;
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Bulgarian.
</body>
</html>

View File

@ -0,0 +1,193 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я

View File

@ -0,0 +1,70 @@
package org.apache.lucene.analysis.bg;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
 * Tests for {@link BulgarianAnalyzer}: resource loading, stopword handling,
 * stream reuse and a few stemming examples.
 */
public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Constructing the analyzer loads the default stopword set, so this test
   * fails if the stopwords file is missing from the classpath.
   */
  public void testResourcesAvailable() {
    new BulgarianAnalyzer(Version.LUCENE_CURRENT);
  }

  /**
   * With the default stopword set, only the non-stopword survives.
   */
  public void testStopwords() throws IOException {
    final Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    final String[] expected = {"казваш"};
    assertAnalyzesTo(analyzer, "Как се казваш?", expected);
  }

  /**
   * With an empty stopword set, every word is kept (lowercased).
   */
  public void testCustomStopwords() throws IOException {
    final Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT,
        Collections.emptySet());
    final String[] expected = {"как", "се", "казваш"};
    assertAnalyzesTo(analyzer, "Как се казваш?", expected);
  }

  /**
   * The reusable stream must give correct output on consecutive inputs.
   */
  public void testReusableTokenStream() throws IOException {
    final Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    final String[] expected = {"документ"};
    assertAnalyzesToReuse(analyzer, "документи", expected);
    assertAnalyzesToReuse(analyzer, "документ", expected);
  }

  /**
   * Test some examples from the paper
   */
  public void testBasicExamples() throws IOException {
    final Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertAnalyzesTo(analyzer, "енергийни кризи",
        new String[] {"енергийн", "криз"});
    assertAnalyzesTo(analyzer, "Атомната енергия",
        new String[] {"атомн", "енерг"});

    assertAnalyzesTo(analyzer, "компютри", new String[] {"компютр"});
    assertAnalyzesTo(analyzer, "компютър", new String[] {"компютр"});

    assertAnalyzesTo(analyzer, "градове", new String[] {"град"});
  }
}

View File

@ -0,0 +1,210 @@
package org.apache.lucene.analysis.bg;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
 * Test the Bulgarian Stemmer.
 * <p>
 * Each test lists {input, expected stem} pairs and runs them, in order,
 * through a {@link BulgarianAnalyzer} with the default stopwords.
 */
public class TestBulgarianStemmer extends BaseTokenStreamTestCase {

  /**
   * Analyzes each input word and checks that it yields exactly one token equal
   * to the expected stem.
   *
   * @param analyzer analyzer under test
   * @param cases rows of {input word, expected stem}
   */
  private void assertStems(BulgarianAnalyzer analyzer, String[][] cases)
      throws IOException {
    for (int i = 0; i < cases.length; i++) {
      assertAnalyzesTo(analyzer, cases[i][0], new String[] {cases[i][1]});
    }
  }

  /**
   * Test showing how masculine noun forms conflate. An example noun for each
   * common (and some rare) plural pattern is listed.
   */
  public void testMasculineNouns() throws IOException {
    BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertStems(analyzer, new String[][] {
      // -ове pattern
      {"град", "град"},
      {"града", "град"},
      {"градът", "град"},
      {"градове", "град"},
      {"градовете", "град"},

      // -и pattern
      {"народ", "народ"},
      {"народа", "народ"},
      {"народът", "народ"},
      {"народи", "народ"},
      {"народите", "народ"},
      {"народе", "народ"},

      // -ища pattern
      {"път", "път"},
      {"пътя", "път"},
      {"пътят", "път"},
      {"пътища", "път"},
      {"пътищата", "път"},

      // -чета pattern
      {"градец", "градец"},
      {"градеца", "градец"},
      {"градецът", "градец"},
      /* note the below forms conflate with each other, but not the rest */
      {"градовце", "градовц"},
      {"градовцете", "градовц"},

      // -овци pattern
      {"дядо", "дяд"},
      {"дядото", "дяд"},
      {"дядовци", "дяд"},
      {"дядовците", "дяд"},

      // -е pattern
      {"мъж", "мъж"},
      {"мъжа", "мъж"},
      {"мъже", "мъж"},
      {"мъжете", "мъж"},
      {"мъжо", "мъж"},
      /* word is too short, will not remove -ът */
      {"мъжът", "мъжът"},

      // -а pattern
      {"крак", "крак"},
      {"крака", "крак"},
      {"кракът", "крак"},
      {"краката", "крак"},

      // брат
      {"брат", "брат"},
      {"брата", "брат"},
      {"братът", "брат"},
      {"братя", "брат"},
      {"братята", "брат"},
      {"брате", "брат"},
    });
  }

  /**
   * Test showing how feminine noun forms conflate
   */
  public void testFeminineNouns() throws IOException {
    BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertStems(analyzer, new String[][] {
      {"вест", "вест"},
      {"вестта", "вест"},
      {"вести", "вест"},
      {"вестите", "вест"},
    });
  }

  /**
   * Test showing how neuter noun forms conflate. An example noun for each
   * common plural pattern is listed.
   */
  public void testNeuterNouns() throws IOException {
    BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertStems(analyzer, new String[][] {
      // -а pattern
      {"дърво", "дърв"},
      {"дървото", "дърв"},
      {"дърва", "дърв"},
      {"дървета", "дърв"},
      {"дървата", "дърв"},
      {"дърветата", "дърв"},

      // -та pattern
      {"море", "мор"},
      {"морето", "мор"},
      {"морета", "мор"},
      {"моретата", "мор"},

      // -я pattern
      {"изключение", "изключени"},
      {"изключението", "изключени"},
      {"изключенията", "изключени"},
      /* note the below form in this example does not conflate with the rest */
      {"изключения", "изключн"},
    });
  }

  /**
   * Test showing how adjectival forms conflate
   */
  public void testAdjectives() throws IOException {
    BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertStems(analyzer, new String[][] {
      {"красив", "красив"},
      {"красивия", "красив"},
      {"красивият", "красив"},
      {"красива", "красив"},
      {"красивата", "красив"},
      {"красиво", "красив"},
      {"красивото", "красив"},
      {"красиви", "красив"},
      {"красивите", "красив"},
    });
  }

  /**
   * Test some exceptional rules, implemented as rewrites.
   */
  public void testExceptions() throws IOException {
    BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
    assertStems(analyzer, new String[][] {
      // ци -> к
      {"собственик", "собственик"},
      {"собственика", "собственик"},
      {"собственикът", "собственик"},
      {"собственици", "собственик"},
      {"собствениците", "собственик"},

      // зи -> г
      {"подлог", "подлог"},
      {"подлога", "подлог"},
      {"подлогът", "подлог"},
      {"подлози", "подлог"},
      {"подлозите", "подлог"},

      // си -> х
      {"кожух", "кожух"},
      {"кожуха", "кожух"},
      {"кожухът", "кожух"},
      {"кожуси", "кожух"},
      {"кожусите", "кожух"},

      // ъ deletion
      {"център", "центр"},
      {"центъра", "центр"},
      {"центърът", "центр"},
      {"центрове", "центр"},
      {"центровете", "центр"},

      // е*и -> я*
      {"промяна", "промян"},
      {"промяната", "промян"},
      {"промени", "промян"},
      {"промените", "промян"},

      // ен -> н
      {"песен", "песн"},
      {"песента", "песн"},
      {"песни", "песн"},
      {"песните", "песн"},

      // -еве -> й
      // note: this is the only word i think this rule works for.
      // most -еве pluralized nouns are monosyllabic,
      // and the stemmer requires length > 6...
      {"строй", "строй"},
      {"строеве", "строй"},
      {"строевете", "строй"},
      /* note the below forms conflate with each other, but not the rest */
      {"строя", "стр"},
      {"строят", "стр"},
    });
  }
}