mirror of https://github.com/apache/lucene.git
LUCENE-2062: Bulgarian Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@886190 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bdf44dce94
commit
892bc7f55a
|
@ -20,6 +20,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
||||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
|
The Bulgarian analyzer (contrib/analyzers) comes with a default
|
||||||
|
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||||
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||||
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||||
|
|
||||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||||
|
|
|
@ -15,6 +15,8 @@ New features
|
||||||
* LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
|
* LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words
|
||||||
when Version is set to 3.1 or higher. (Robert Muir)
|
when Version is set to 3.1 or higher. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2062: Add a Bulgarian analyzer. (Robert Muir, Simon Willnauer)
|
||||||
|
|
||||||
|
|
||||||
======================= Release 3.0.0 2009-11-25 =======================
|
======================= Release 3.0.0 2009-11-25 =======================
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,176 @@
|
||||||
|
package org.apache.lucene.analysis.bg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Bulgarian.
|
||||||
|
* <p>
|
||||||
|
* This analyzer implements light-stemming as specified by: <i> Searching
|
||||||
|
* Strategies for the Bulgarian Language </i>
|
||||||
|
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||||
|
* <p>
|
||||||
|
*/
|
||||||
|
public final class BulgarianAnalyzer extends Analyzer {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Bulgarian stopwords.
|
||||||
|
*
|
||||||
|
* Default stopword list is from
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
|
||||||
|
* BSD-Licensed.
|
||||||
|
*/
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains the stopwords used with the StopFilter.
|
||||||
|
*/
|
||||||
|
private final Set<?> stoptable;
|
||||||
|
/**
|
||||||
|
* The comment character in the stopwords file. All lines prefixed with this
|
||||||
|
* will be ignored
|
||||||
|
*/
|
||||||
|
public static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
*
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static Set<String> getDefaultStopSet() {
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
|
||||||
|
* class accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<String> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||||
|
} catch (Exception ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||||
|
final InputStream stream = BulgarianAnalyzer.class
|
||||||
|
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||||
|
try {
|
||||||
|
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||||
|
// make sure it is unmodifiable as we expose it in the outer class
|
||||||
|
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||||
|
STOPWORDS_COMMENT));
|
||||||
|
} finally {
|
||||||
|
if(stream != null)
|
||||||
|
stream.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Version matchVersion;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words:
|
||||||
|
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public BulgarianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
super();
|
||||||
|
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||||
|
stopwords));
|
||||||
|
this.matchVersion = matchVersion;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||||
|
result = new StandardFilter(result);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stoptable);
|
||||||
|
result = new BulgarianStemFilter(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class SavedStreams {
|
||||||
|
Tokenizer source;
|
||||||
|
TokenStream result;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||||
|
* text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||||
|
throws IOException {
|
||||||
|
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||||
|
if (streams == null) {
|
||||||
|
streams = new SavedStreams();
|
||||||
|
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
streams.result = new StandardFilter(streams.source);
|
||||||
|
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||||
|
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||||
|
streams.result = new BulgarianStemFilter(streams.result);
|
||||||
|
setPreviousTokenStream(streams);
|
||||||
|
} else {
|
||||||
|
streams.source.reset(reader);
|
||||||
|
}
|
||||||
|
return streams.result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
package org.apache.lucene.analysis.bg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
|
||||||
|
* words.
|
||||||
|
*/
|
||||||
|
public final class BulgarianStemFilter extends TokenFilter {
|
||||||
|
private final BulgarianStemmer stemmer;
|
||||||
|
private final TermAttribute termAtt;
|
||||||
|
|
||||||
|
public BulgarianStemFilter(final TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
stemmer = new BulgarianStemmer();
|
||||||
|
termAtt = addAttribute(TermAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
|
||||||
|
termAtt.setTermLength(newlen);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,152 @@
|
||||||
|
package org.apache.lucene.analysis.bg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Light Stemmer for Bulgarian.
 * <p>
 * Implements the algorithm described in:
 * <i>
 * Searching Strategies for the Bulgarian Language
 * </i>
 * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
 * <p>
 * Stemming happens in place: the caller's buffer is modified and the new
 * logical length is returned. The rules compare against lowercase Cyrillic
 * letters only.
 */
public class BulgarianStemmer {

  /**
   * Stem an input buffer of Bulgarian text.
   *
   * @param s input buffer; may be modified
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(final char[] s, int len) {
    if (len < 4) { // do not stem
      return len;
    }

    if (len > 5 && endsWith(s, len, "ища")) {
      return len - 3;
    }

    len = removeArticle(s, len);
    len = removePlural(s, len);

    if (len > 3) {
      if (endsWith(s, len, "я")) {
        len--;
      }
      // checked against the possibly shortened word, so "...оя" loses both letters
      if (endsWith(s, len, "а")
          || endsWith(s, len, "о")
          || endsWith(s, len, "е")) {
        len--;
      }
    }

    // the rule to rewrite ен -> н is duplicated in the paper.
    // in the perl implementation referenced by the paper, this is fixed.
    // (it is fixed here as well)
    if (len > 4 && endsWith(s, len, "ен")) {
      s[len - 2] = 'н'; // replace with н
      len--;
    }

    if (len > 5 && s[len - 2] == 'ъ') {
      s[len - 2] = s[len - 1]; // replace ъN with N
      len--;
    }

    return len;
  }

  /**
   * Mainly remove the definite article.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new stemmed length
   */
  private static int removeArticle(final char[] s, final int len) {
    if (len > 6 && endsWith(s, len, "ият")) {
      return len - 3;
    }

    if (len > 5) {
      if (endsWith(s, len, "ът")
          || endsWith(s, len, "то")
          || endsWith(s, len, "те")
          || endsWith(s, len, "та")
          || endsWith(s, len, "ия")) {
        return len - 2;
      }
    }

    if (len > 4 && endsWith(s, len, "ят")) {
      return len - 2;
    }

    return len;
  }

  /**
   * Remove or rewrite a plural ending. At most one rule is applied; longer
   * words are tried against the longer suffixes first.
   *
   * @param s input buffer; may be modified by the rewriting rules
   * @param len length of input buffer
   * @return new stemmed length
   */
  private static int removePlural(final char[] s, final int len) {
    if (len > 6) {
      if (endsWith(s, len, "овци")) {
        return len - 3; // replace with о
      }
      if (endsWith(s, len, "ове")) {
        return len - 3;
      }
      if (endsWith(s, len, "еве")) {
        s[len - 3] = 'й'; // replace with й
        return len - 2;
      }
    }

    if (len > 5) {
      if (endsWith(s, len, "ища")) {
        return len - 3;
      }
      if (endsWith(s, len, "та")) {
        return len - 2;
      }
      if (endsWith(s, len, "ци")) {
        s[len - 2] = 'к'; // replace with к
        return len - 1;
      }
      if (endsWith(s, len, "зи")) {
        s[len - 2] = 'г'; // replace with г
        return len - 1;
      }

      if (s[len - 3] == 'е' && s[len - 1] == 'и') {
        s[len - 3] = 'я'; // replace е with я, remove и
        return len - 1;
      }
    }

    if (len > 4) {
      if (endsWith(s, len, "си")) {
        s[len - 2] = 'х'; // replace with х
        return len - 1;
      }
      if (endsWith(s, len, "и")) {
        return len - 1;
      }
    }

    return len;
  }

  /**
   * Tests whether the first {@code len} characters of {@code s} end with
   * {@code suffix}.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to look for
   * @return {@code true} if the suffix matches; {@code false} if it does not
   *         or is longer than {@code len}
   */
  private static boolean endsWith(final char[] s, final int len, final String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len) {
      return false;
    }
    // compare back to front, aligning the suffix with the end of the word
    for (int i = suffixLen - 1; i >= 0; i--) {
      if (s[len - (suffixLen - i)] != suffix.charAt(i)) {
        return false;
      }
    }

    return true;
  }
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Bulgarian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,193 @@
|
||||||
|
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
а
|
||||||
|
аз
|
||||||
|
ако
|
||||||
|
ала
|
||||||
|
бе
|
||||||
|
без
|
||||||
|
беше
|
||||||
|
би
|
||||||
|
бил
|
||||||
|
била
|
||||||
|
били
|
||||||
|
било
|
||||||
|
близо
|
||||||
|
бъдат
|
||||||
|
бъде
|
||||||
|
бяха
|
||||||
|
в
|
||||||
|
вас
|
||||||
|
ваш
|
||||||
|
ваша
|
||||||
|
вероятно
|
||||||
|
вече
|
||||||
|
взема
|
||||||
|
ви
|
||||||
|
вие
|
||||||
|
винаги
|
||||||
|
все
|
||||||
|
всеки
|
||||||
|
всички
|
||||||
|
всичко
|
||||||
|
всяка
|
||||||
|
във
|
||||||
|
въпреки
|
||||||
|
върху
|
||||||
|
г
|
||||||
|
ги
|
||||||
|
главно
|
||||||
|
го
|
||||||
|
д
|
||||||
|
да
|
||||||
|
дали
|
||||||
|
до
|
||||||
|
докато
|
||||||
|
докога
|
||||||
|
дори
|
||||||
|
досега
|
||||||
|
доста
|
||||||
|
е
|
||||||
|
едва
|
||||||
|
един
|
||||||
|
ето
|
||||||
|
за
|
||||||
|
зад
|
||||||
|
заедно
|
||||||
|
заради
|
||||||
|
засега
|
||||||
|
затова
|
||||||
|
защо
|
||||||
|
защото
|
||||||
|
и
|
||||||
|
из
|
||||||
|
или
|
||||||
|
им
|
||||||
|
има
|
||||||
|
имат
|
||||||
|
иска
|
||||||
|
й
|
||||||
|
каза
|
||||||
|
как
|
||||||
|
каква
|
||||||
|
какво
|
||||||
|
както
|
||||||
|
какъв
|
||||||
|
като
|
||||||
|
кога
|
||||||
|
когато
|
||||||
|
което
|
||||||
|
които
|
||||||
|
кой
|
||||||
|
който
|
||||||
|
колко
|
||||||
|
която
|
||||||
|
къде
|
||||||
|
където
|
||||||
|
към
|
||||||
|
ли
|
||||||
|
м
|
||||||
|
ме
|
||||||
|
между
|
||||||
|
мен
|
||||||
|
ми
|
||||||
|
мнозина
|
||||||
|
мога
|
||||||
|
могат
|
||||||
|
може
|
||||||
|
моля
|
||||||
|
момента
|
||||||
|
му
|
||||||
|
н
|
||||||
|
на
|
||||||
|
над
|
||||||
|
назад
|
||||||
|
най
|
||||||
|
направи
|
||||||
|
напред
|
||||||
|
например
|
||||||
|
нас
|
||||||
|
не
|
||||||
|
него
|
||||||
|
нея
|
||||||
|
ни
|
||||||
|
ние
|
||||||
|
никой
|
||||||
|
нито
|
||||||
|
но
|
||||||
|
някои
|
||||||
|
някой
|
||||||
|
няма
|
||||||
|
обаче
|
||||||
|
около
|
||||||
|
освен
|
||||||
|
особено
|
||||||
|
от
|
||||||
|
отгоре
|
||||||
|
отново
|
||||||
|
още
|
||||||
|
пак
|
||||||
|
по
|
||||||
|
повече
|
||||||
|
повечето
|
||||||
|
под
|
||||||
|
поне
|
||||||
|
поради
|
||||||
|
после
|
||||||
|
почти
|
||||||
|
прави
|
||||||
|
пред
|
||||||
|
преди
|
||||||
|
през
|
||||||
|
при
|
||||||
|
пък
|
||||||
|
първо
|
||||||
|
с
|
||||||
|
са
|
||||||
|
само
|
||||||
|
се
|
||||||
|
сега
|
||||||
|
си
|
||||||
|
скоро
|
||||||
|
след
|
||||||
|
сме
|
||||||
|
според
|
||||||
|
сред
|
||||||
|
срещу
|
||||||
|
сте
|
||||||
|
съм
|
||||||
|
със
|
||||||
|
също
|
||||||
|
т
|
||||||
|
тази
|
||||||
|
така
|
||||||
|
такива
|
||||||
|
такъв
|
||||||
|
там
|
||||||
|
твой
|
||||||
|
те
|
||||||
|
тези
|
||||||
|
ти
|
||||||
|
тн
|
||||||
|
то
|
||||||
|
това
|
||||||
|
тогава
|
||||||
|
този
|
||||||
|
той
|
||||||
|
толкова
|
||||||
|
точно
|
||||||
|
трябва
|
||||||
|
тук
|
||||||
|
тъй
|
||||||
|
тя
|
||||||
|
тях
|
||||||
|
у
|
||||||
|
харесва
|
||||||
|
ч
|
||||||
|
че
|
||||||
|
често
|
||||||
|
чрез
|
||||||
|
ще
|
||||||
|
щом
|
||||||
|
я
|
|
@ -0,0 +1,70 @@
|
||||||
|
package org.apache.lucene.analysis.bg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the Bulgarian analyzer
|
||||||
|
*/
|
||||||
|
public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This test fails with NPE when the stopwords file is missing in classpath
|
||||||
|
*/
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStopwords() throws IOException {
|
||||||
|
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCustomStopwords() throws IOException {
|
||||||
|
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections
|
||||||
|
.emptySet());
|
||||||
|
assertAnalyzesTo(a, "Как се казваш?",
|
||||||
|
new String[] {"как", "се", "казваш"});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testReusableTokenStream() throws IOException {
|
||||||
|
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
|
||||||
|
assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test some examples from the paper
|
||||||
|
*/
|
||||||
|
public void testBasicExamples() throws IOException {
|
||||||
|
Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"});
|
||||||
|
assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"});
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "компютри", new String[] {"компютр"});
|
||||||
|
assertAnalyzesTo(a, "компютър", new String[] {"компютр"});
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "градове", new String[] {"град"});
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,210 @@
|
||||||
|
package org.apache.lucene.analysis.bg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the Bulgarian Stemmer
|
||||||
|
*/
|
||||||
|
public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
|
||||||
|
/**
|
||||||
|
* Test showing how masculine noun forms conflate. An example noun for each
|
||||||
|
* common (and some rare) plural pattern is listed.
|
||||||
|
*/
|
||||||
|
public void testMasculineNouns() throws IOException {
|
||||||
|
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
|
||||||
|
// -и pattern
|
||||||
|
assertAnalyzesTo(a, "град", new String[] {"град"});
|
||||||
|
assertAnalyzesTo(a, "града", new String[] {"град"});
|
||||||
|
assertAnalyzesTo(a, "градът", new String[] {"град"});
|
||||||
|
assertAnalyzesTo(a, "градове", new String[] {"град"});
|
||||||
|
assertAnalyzesTo(a, "градовете", new String[] {"град"});
|
||||||
|
|
||||||
|
// -ове pattern
|
||||||
|
assertAnalyzesTo(a, "народ", new String[] {"народ"});
|
||||||
|
assertAnalyzesTo(a, "народа", new String[] {"народ"});
|
||||||
|
assertAnalyzesTo(a, "народът", new String[] {"народ"});
|
||||||
|
assertAnalyzesTo(a, "народи", new String[] {"народ"});
|
||||||
|
assertAnalyzesTo(a, "народите", new String[] {"народ"});
|
||||||
|
assertAnalyzesTo(a, "народе", new String[] {"народ"});
|
||||||
|
|
||||||
|
// -ища pattern
|
||||||
|
assertAnalyzesTo(a, "път", new String[] {"път"});
|
||||||
|
assertAnalyzesTo(a, "пътя", new String[] {"път"});
|
||||||
|
assertAnalyzesTo(a, "пътят", new String[] {"път"});
|
||||||
|
assertAnalyzesTo(a, "пътища", new String[] {"път"});
|
||||||
|
assertAnalyzesTo(a, "пътищата", new String[] {"път"});
|
||||||
|
|
||||||
|
// -чета pattern
|
||||||
|
assertAnalyzesTo(a, "градец", new String[] {"градец"});
|
||||||
|
assertAnalyzesTo(a, "градеца", new String[] {"градец"});
|
||||||
|
assertAnalyzesTo(a, "градецът", new String[] {"градец"});
|
||||||
|
/* note the below forms conflate with each other, but not the rest */
|
||||||
|
assertAnalyzesTo(a, "градовце", new String[] {"градовц"});
|
||||||
|
assertAnalyzesTo(a, "градовцете", new String[] {"градовц"});
|
||||||
|
|
||||||
|
// -овци pattern
|
||||||
|
assertAnalyzesTo(a, "дядо", new String[] {"дяд"});
|
||||||
|
assertAnalyzesTo(a, "дядото", new String[] {"дяд"});
|
||||||
|
assertAnalyzesTo(a, "дядовци", new String[] {"дяд"});
|
||||||
|
assertAnalyzesTo(a, "дядовците", new String[] {"дяд"});
|
||||||
|
|
||||||
|
// -е pattern
|
||||||
|
assertAnalyzesTo(a, "мъж", new String[] {"мъж"});
|
||||||
|
assertAnalyzesTo(a, "мъжа", new String[] {"мъж"});
|
||||||
|
assertAnalyzesTo(a, "мъже", new String[] {"мъж"});
|
||||||
|
assertAnalyzesTo(a, "мъжете", new String[] {"мъж"});
|
||||||
|
assertAnalyzesTo(a, "мъжо", new String[] {"мъж"});
|
||||||
|
/* word is too short, will not remove -ът */
|
||||||
|
assertAnalyzesTo(a, "мъжът", new String[] {"мъжът"});
|
||||||
|
|
||||||
|
// -а pattern
|
||||||
|
assertAnalyzesTo(a, "крак", new String[] {"крак"});
|
||||||
|
assertAnalyzesTo(a, "крака", new String[] {"крак"});
|
||||||
|
assertAnalyzesTo(a, "кракът", new String[] {"крак"});
|
||||||
|
assertAnalyzesTo(a, "краката", new String[] {"крак"});
|
||||||
|
|
||||||
|
// брат
|
||||||
|
assertAnalyzesTo(a, "брат", new String[] {"брат"});
|
||||||
|
assertAnalyzesTo(a, "брата", new String[] {"брат"});
|
||||||
|
assertAnalyzesTo(a, "братът", new String[] {"брат"});
|
||||||
|
assertAnalyzesTo(a, "братя", new String[] {"брат"});
|
||||||
|
assertAnalyzesTo(a, "братята", new String[] {"брат"});
|
||||||
|
assertAnalyzesTo(a, "брате", new String[] {"брат"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test showing how feminine noun forms conflate
|
||||||
|
*/
|
||||||
|
public void testFeminineNouns() throws IOException {
|
||||||
|
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "вест", new String[] {"вест"});
|
||||||
|
assertAnalyzesTo(a, "вестта", new String[] {"вест"});
|
||||||
|
assertAnalyzesTo(a, "вести", new String[] {"вест"});
|
||||||
|
assertAnalyzesTo(a, "вестите", new String[] {"вест"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test showing how neuter noun forms conflate an example noun for each common
|
||||||
|
* plural pattern is listed
|
||||||
|
*/
|
||||||
|
public void testNeuterNouns() throws IOException {
|
||||||
|
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
|
||||||
|
// -а pattern
|
||||||
|
assertAnalyzesTo(a, "дърво", new String[] {"дърв"});
|
||||||
|
assertAnalyzesTo(a, "дървото", new String[] {"дърв"});
|
||||||
|
assertAnalyzesTo(a, "дърва", new String[] {"дърв"});
|
||||||
|
assertAnalyzesTo(a, "дървета", new String[] {"дърв"});
|
||||||
|
assertAnalyzesTo(a, "дървата", new String[] {"дърв"});
|
||||||
|
assertAnalyzesTo(a, "дърветата", new String[] {"дърв"});
|
||||||
|
|
||||||
|
// -та pattern
|
||||||
|
assertAnalyzesTo(a, "море", new String[] {"мор"});
|
||||||
|
assertAnalyzesTo(a, "морето", new String[] {"мор"});
|
||||||
|
assertAnalyzesTo(a, "морета", new String[] {"мор"});
|
||||||
|
assertAnalyzesTo(a, "моретата", new String[] {"мор"});
|
||||||
|
|
||||||
|
// -я pattern
|
||||||
|
assertAnalyzesTo(a, "изключение", new String[] {"изключени"});
|
||||||
|
assertAnalyzesTo(a, "изключението", new String[] {"изключени"});
|
||||||
|
assertAnalyzesTo(a, "изключенията", new String[] {"изключени"});
|
||||||
|
/* note the below form in this example does not conflate with the rest */
|
||||||
|
assertAnalyzesTo(a, "изключения", new String[] {"изключн"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test showing how adjectival forms conflate
|
||||||
|
*/
|
||||||
|
public void testAdjectives() throws IOException {
|
||||||
|
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
assertAnalyzesTo(a, "красив", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красивия", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красивият", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красива", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красивата", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красиво", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красивото", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красиви", new String[] {"красив"});
|
||||||
|
assertAnalyzesTo(a, "красивите", new String[] {"красив"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test some exceptional rules, implemented as rewrites.
|
||||||
|
*/
|
||||||
|
public void testExceptions() throws IOException {
|
||||||
|
BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
|
||||||
|
// ци -> к
|
||||||
|
assertAnalyzesTo(a, "собственик", new String[] {"собственик"});
|
||||||
|
assertAnalyzesTo(a, "собственика", new String[] {"собственик"});
|
||||||
|
assertAnalyzesTo(a, "собственикът", new String[] {"собственик"});
|
||||||
|
assertAnalyzesTo(a, "собственици", new String[] {"собственик"});
|
||||||
|
assertAnalyzesTo(a, "собствениците", new String[] {"собственик"});
|
||||||
|
|
||||||
|
// зи -> г
|
||||||
|
assertAnalyzesTo(a, "подлог", new String[] {"подлог"});
|
||||||
|
assertAnalyzesTo(a, "подлога", new String[] {"подлог"});
|
||||||
|
assertAnalyzesTo(a, "подлогът", new String[] {"подлог"});
|
||||||
|
assertAnalyzesTo(a, "подлози", new String[] {"подлог"});
|
||||||
|
assertAnalyzesTo(a, "подлозите", new String[] {"подлог"});
|
||||||
|
|
||||||
|
// си -> х
|
||||||
|
assertAnalyzesTo(a, "кожух", new String[] {"кожух"});
|
||||||
|
assertAnalyzesTo(a, "кожуха", new String[] {"кожух"});
|
||||||
|
assertAnalyzesTo(a, "кожухът", new String[] {"кожух"});
|
||||||
|
assertAnalyzesTo(a, "кожуси", new String[] {"кожух"});
|
||||||
|
assertAnalyzesTo(a, "кожусите", new String[] {"кожух"});
|
||||||
|
|
||||||
|
// ъ deletion
|
||||||
|
assertAnalyzesTo(a, "център", new String[] {"центр"});
|
||||||
|
assertAnalyzesTo(a, "центъра", new String[] {"центр"});
|
||||||
|
assertAnalyzesTo(a, "центърът", new String[] {"центр"});
|
||||||
|
assertAnalyzesTo(a, "центрове", new String[] {"центр"});
|
||||||
|
assertAnalyzesTo(a, "центровете", new String[] {"центр"});
|
||||||
|
|
||||||
|
// е*и -> я*
|
||||||
|
assertAnalyzesTo(a, "промяна", new String[] {"промян"});
|
||||||
|
assertAnalyzesTo(a, "промяната", new String[] {"промян"});
|
||||||
|
assertAnalyzesTo(a, "промени", new String[] {"промян"});
|
||||||
|
assertAnalyzesTo(a, "промените", new String[] {"промян"});
|
||||||
|
|
||||||
|
// ен -> н
|
||||||
|
assertAnalyzesTo(a, "песен", new String[] {"песн"});
|
||||||
|
assertAnalyzesTo(a, "песента", new String[] {"песн"});
|
||||||
|
assertAnalyzesTo(a, "песни", new String[] {"песн"});
|
||||||
|
assertAnalyzesTo(a, "песните", new String[] {"песн"});
|
||||||
|
|
||||||
|
// -еве -> й
|
||||||
|
// note: this is the only word i think this rule works for.
|
||||||
|
// most -еве pluralized nouns are monosyllabic,
|
||||||
|
// and the stemmer requires length > 6...
|
||||||
|
assertAnalyzesTo(a, "строй", new String[] {"строй"});
|
||||||
|
assertAnalyzesTo(a, "строеве", new String[] {"строй"});
|
||||||
|
assertAnalyzesTo(a, "строевете", new String[] {"строй"});
|
||||||
|
/* note the below forms conflate with each other, but not the rest */
|
||||||
|
assertAnalyzesTo(a, "строя", new String[] {"стр"});
|
||||||
|
assertAnalyzesTo(a, "строят", new String[] {"стр"});
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue