mirror of https://github.com/apache/lucene.git
LUCENE-3016: add analyzer for Latvian
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9b8cfb80b5
commit
c3f6331639
|
@ -50,6 +50,10 @@ Bug fixes
|
||||||
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
|
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
|
||||||
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
|
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
|
||||||
|
|
||||||
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
|
||||||
|
|
||||||
======================= Lucene 3.1.0 =======================
|
======================= Lucene 3.1.0 =======================
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Latvian.
|
||||||
|
*/
|
||||||
|
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Latvian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public LatvianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a
|
||||||
|
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* which tokenizes all the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A
|
||||||
|
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* built from an {@link StandardTokenizer} filtered with
|
||||||
|
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||||
|
* , {@link KeywordMarkerFilter} if a stem exclusion set is
|
||||||
|
* provided and {@link LatvianStemFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(matchVersion, source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||||
|
result = new LatvianStemFilter(result);
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class LatvianStemFilter extends TokenFilter {
|
||||||
|
private final LatvianStemmer stemmer = new LatvianStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public LatvianStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,174 @@
|
||||||
|
package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light stemmer for Latvian.
|
||||||
|
* <p>
|
||||||
|
* This is a light version of the algorithm in Karlis Kreslins' PhD thesis
|
||||||
|
* <i>A stemming algorithm for Latvian</i> with the following modifications:
|
||||||
|
* <ul>
|
||||||
|
* <li>Only explicitly stems noun and adjective morphology
|
||||||
|
* <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
|
||||||
|
* <li>Removes only the primary inflectional suffixes: case and number for nouns ;
|
||||||
|
* case, number, gender, and definiteness for adjectives.
|
||||||
|
* <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
|
public class LatvianStemmer {
|
||||||
|
/**
|
||||||
|
* Stem a latvian word. returns the new adjusted length.
|
||||||
|
*/
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
int numVowels = numVowels(s, len);
|
||||||
|
|
||||||
|
for (int i = 0; i < affixes.length; i++) {
|
||||||
|
Affix affix = affixes[i];
|
||||||
|
if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
|
||||||
|
len -= affix.affix.length;
|
||||||
|
return affix.palatalizes ? unpalatalize(s, len) : len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
static final Affix affixes[] = {
|
||||||
|
new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
|
||||||
|
new Affix("ajam", 2, false), new Affix("ajām", 2, false),
|
||||||
|
new Affix("ajos", 2, false), new Affix("ajās", 2, false),
|
||||||
|
new Affix("iem", 2, true), new Affix("ajā", 2, false),
|
||||||
|
new Affix("ais", 2, false), new Affix("ai", 2, false),
|
||||||
|
new Affix("ei", 2, false), new Affix("ām", 1, false),
|
||||||
|
new Affix("am", 1, false), new Affix("ēm", 1, false),
|
||||||
|
new Affix("īm", 1, false), new Affix("im", 1, false),
|
||||||
|
new Affix("um", 1, false), new Affix("us", 1, true),
|
||||||
|
new Affix("as", 1, false), new Affix("ās", 1, false),
|
||||||
|
new Affix("es", 1, false), new Affix("os", 1, true),
|
||||||
|
new Affix("ij", 1, false), new Affix("īs", 1, false),
|
||||||
|
new Affix("ēs", 1, false), new Affix("is", 1, false),
|
||||||
|
new Affix("ie", 1, false), new Affix("u", 1, true),
|
||||||
|
new Affix("a", 1, true), new Affix("i", 1, true),
|
||||||
|
new Affix("e", 1, false), new Affix("ā", 1, false),
|
||||||
|
new Affix("ē", 1, false), new Affix("ī", 1, false),
|
||||||
|
new Affix("ū", 1, false), new Affix("o", 1, false),
|
||||||
|
new Affix("s", 0, false), new Affix("š", 0, false),
|
||||||
|
};
|
||||||
|
|
||||||
|
static class Affix {
|
||||||
|
char affix[]; // suffix
|
||||||
|
int vc; // vowel count of the suffix
|
||||||
|
boolean palatalizes; // true if we should fire palatalization rules.
|
||||||
|
|
||||||
|
Affix(String affix, int vc, boolean palatalizes) {
|
||||||
|
this.affix = affix.toCharArray();
|
||||||
|
this.vc = vc;
|
||||||
|
this.palatalizes = palatalizes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Most cases are handled except for the ambiguous ones:
|
||||||
|
* <ul>
|
||||||
|
* <li> s -> š
|
||||||
|
* <li> t -> š
|
||||||
|
* <li> d -> ž
|
||||||
|
* <li> z -> ž
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
|
private int unpalatalize(char s[], int len) {
|
||||||
|
// we check the character removed: if its -u then
|
||||||
|
// its 2,5, or 6 gen pl., and these two can only apply then.
|
||||||
|
if (s[len] == 'u') {
|
||||||
|
// kš -> kst
|
||||||
|
if (endsWith(s, len, "kš")) {
|
||||||
|
len++;
|
||||||
|
s[len-2] = 's';
|
||||||
|
s[len-1] = 't';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
// ņņ -> nn
|
||||||
|
if (endsWith(s, len, "ņņ")) {
|
||||||
|
s[len-2] = 'n';
|
||||||
|
s[len-1] = 'n';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise all other rules
|
||||||
|
if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
|
||||||
|
|| endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
|
||||||
|
// labial consonant
|
||||||
|
return len-1;
|
||||||
|
} else if (endsWith(s, len, "šņ")) {
|
||||||
|
s[len-2] = 's';
|
||||||
|
s[len-1] = 'n';
|
||||||
|
return len;
|
||||||
|
} else if (endsWith(s, len, "žņ")) {
|
||||||
|
s[len-2] = 'z';
|
||||||
|
s[len-1] = 'n';
|
||||||
|
return len;
|
||||||
|
} else if (endsWith(s, len, "šļ")) {
|
||||||
|
s[len-2] = 's';
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
} else if (endsWith(s, len, "žļ")) {
|
||||||
|
s[len-2] = 'z';
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
} else if (endsWith(s, len, "ļņ")) {
|
||||||
|
s[len-2] = 'l';
|
||||||
|
s[len-1] = 'n';
|
||||||
|
return len;
|
||||||
|
} else if (endsWith(s, len, "ļļ")) {
|
||||||
|
s[len-2] = 'l';
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
} else if (s[len-1] == 'č') {
|
||||||
|
s[len-1] = 'c';
|
||||||
|
return len;
|
||||||
|
} else if (s[len-1] == 'ļ') {
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
} else if (s[len-1] == 'ņ') {
|
||||||
|
s[len-1] = 'n';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Count the vowels in the string, we always require at least
|
||||||
|
* one in the remaining stem to accept it.
|
||||||
|
*/
|
||||||
|
private int numVowels(char s[], int len) {
|
||||||
|
int n = 0;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'a': case 'e': case 'i':
|
||||||
|
case 'o': case 'u': case 'ā':
|
||||||
|
case 'ī': case 'ē': case 'ū':
|
||||||
|
n++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Latvian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,172 @@
|
||||||
|
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
|
||||||
|
# the original list of over 800 forms was refined:
|
||||||
|
# pronouns, adverbs, interjections were removed
|
||||||
|
#
|
||||||
|
# prepositions
|
||||||
|
aiz
|
||||||
|
ap
|
||||||
|
ar
|
||||||
|
apakš
|
||||||
|
ārpus
|
||||||
|
augšpus
|
||||||
|
bez
|
||||||
|
caur
|
||||||
|
dēļ
|
||||||
|
gar
|
||||||
|
iekš
|
||||||
|
iz
|
||||||
|
kopš
|
||||||
|
labad
|
||||||
|
lejpus
|
||||||
|
līdz
|
||||||
|
no
|
||||||
|
otrpus
|
||||||
|
pa
|
||||||
|
par
|
||||||
|
pār
|
||||||
|
pēc
|
||||||
|
pie
|
||||||
|
pirms
|
||||||
|
pret
|
||||||
|
priekš
|
||||||
|
starp
|
||||||
|
šaipus
|
||||||
|
uz
|
||||||
|
viņpus
|
||||||
|
virs
|
||||||
|
virspus
|
||||||
|
zem
|
||||||
|
apakšpus
|
||||||
|
# Conjunctions
|
||||||
|
un
|
||||||
|
bet
|
||||||
|
jo
|
||||||
|
ja
|
||||||
|
ka
|
||||||
|
lai
|
||||||
|
tomēr
|
||||||
|
tikko
|
||||||
|
turpretī
|
||||||
|
arī
|
||||||
|
kaut
|
||||||
|
gan
|
||||||
|
tādēļ
|
||||||
|
tā
|
||||||
|
ne
|
||||||
|
tikvien
|
||||||
|
vien
|
||||||
|
kā
|
||||||
|
ir
|
||||||
|
te
|
||||||
|
vai
|
||||||
|
kamēr
|
||||||
|
# Particles
|
||||||
|
ar
|
||||||
|
diezin
|
||||||
|
droši
|
||||||
|
diemžēl
|
||||||
|
nebūt
|
||||||
|
ik
|
||||||
|
it
|
||||||
|
taču
|
||||||
|
nu
|
||||||
|
pat
|
||||||
|
tiklab
|
||||||
|
iekšpus
|
||||||
|
nedz
|
||||||
|
tik
|
||||||
|
nevis
|
||||||
|
turpretim
|
||||||
|
jeb
|
||||||
|
iekam
|
||||||
|
iekām
|
||||||
|
iekāms
|
||||||
|
kolīdz
|
||||||
|
līdzko
|
||||||
|
tiklīdz
|
||||||
|
jebšu
|
||||||
|
tālab
|
||||||
|
tāpēc
|
||||||
|
nekā
|
||||||
|
itin
|
||||||
|
jā
|
||||||
|
jau
|
||||||
|
jel
|
||||||
|
nē
|
||||||
|
nezin
|
||||||
|
tad
|
||||||
|
tikai
|
||||||
|
vis
|
||||||
|
tak
|
||||||
|
iekams
|
||||||
|
vien
|
||||||
|
# modal verbs
|
||||||
|
būt
|
||||||
|
biju
|
||||||
|
biji
|
||||||
|
bija
|
||||||
|
bijām
|
||||||
|
bijāt
|
||||||
|
esmu
|
||||||
|
esi
|
||||||
|
esam
|
||||||
|
esat
|
||||||
|
būšu
|
||||||
|
būsi
|
||||||
|
būs
|
||||||
|
būsim
|
||||||
|
būsiet
|
||||||
|
tikt
|
||||||
|
tiku
|
||||||
|
tiki
|
||||||
|
tika
|
||||||
|
tikām
|
||||||
|
tikāt
|
||||||
|
tieku
|
||||||
|
tiec
|
||||||
|
tiek
|
||||||
|
tiekam
|
||||||
|
tiekat
|
||||||
|
tikšu
|
||||||
|
tiks
|
||||||
|
tiksim
|
||||||
|
tiksiet
|
||||||
|
tapt
|
||||||
|
tapi
|
||||||
|
tapāt
|
||||||
|
topat
|
||||||
|
tapšu
|
||||||
|
tapsi
|
||||||
|
taps
|
||||||
|
tapsim
|
||||||
|
tapsiet
|
||||||
|
kļūt
|
||||||
|
kļuvu
|
||||||
|
kļuvi
|
||||||
|
kļuva
|
||||||
|
kļuvām
|
||||||
|
kļuvāt
|
||||||
|
kļūstu
|
||||||
|
kļūsti
|
||||||
|
kļūst
|
||||||
|
kļūstam
|
||||||
|
kļūstat
|
||||||
|
kļūšu
|
||||||
|
kļūsi
|
||||||
|
kļūs
|
||||||
|
kļūsim
|
||||||
|
kļūsiet
|
||||||
|
# verbs
|
||||||
|
varēt
|
||||||
|
varēju
|
||||||
|
varējām
|
||||||
|
varēšu
|
||||||
|
varēsim
|
||||||
|
var
|
||||||
|
varēji
|
||||||
|
varējāt
|
||||||
|
varēsi
|
||||||
|
varēsiet
|
||||||
|
varat
|
||||||
|
varēja
|
||||||
|
varēs
|
|
@ -0,0 +1,53 @@
|
||||||
|
package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
|
||||||
|
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "tirgiem", "tirg");
|
||||||
|
checkOneTermReuse(a, "tirgus", "tirg");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "un", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("tirgiem");
|
||||||
|
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
|
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
||||||
|
checkOneTermReuse(a, "tirgus", "tirg");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,272 @@
|
||||||
|
package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic tests for {@link LatvianStemmer}
|
||||||
|
*/
|
||||||
|
public class TestLatvianStemmer extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public void testNouns1() throws IOException {
|
||||||
|
// decl. I
|
||||||
|
checkOneTerm(a, "tēvs", "tēv"); // nom. sing.
|
||||||
|
checkOneTerm(a, "tēvi", "tēv"); // nom. pl.
|
||||||
|
checkOneTerm(a, "tēva", "tēv"); // gen. sing.
|
||||||
|
checkOneTerm(a, "tēvu", "tēv"); // gen. pl.
|
||||||
|
checkOneTerm(a, "tēvam", "tēv"); // dat. sing.
|
||||||
|
checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
|
||||||
|
checkOneTerm(a, "tēvu", "tēv"); // acc. sing.
|
||||||
|
checkOneTerm(a, "tēvus", "tēv"); // acc. pl.
|
||||||
|
checkOneTerm(a, "tēvā", "tēv"); // loc. sing.
|
||||||
|
checkOneTerm(a, "tēvos", "tēv"); // loc. pl.
|
||||||
|
checkOneTerm(a, "tēvs", "tēv"); // voc. sing.
|
||||||
|
checkOneTerm(a, "tēvi", "tēv"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* decl II nouns with (s,t) -> š and (d,z) -> ž
|
||||||
|
* palatalization will generally conflate to two stems
|
||||||
|
* due to the ambiguity (plural and singular).
|
||||||
|
*/
|
||||||
|
public void testNouns2() throws IOException {
|
||||||
|
// decl. II
|
||||||
|
|
||||||
|
// c -> č palatalization
|
||||||
|
checkOneTerm(a, "lācis", "lāc"); // nom. sing.
|
||||||
|
checkOneTerm(a, "lāči", "lāc"); // nom. pl.
|
||||||
|
checkOneTerm(a, "lāča", "lāc"); // gen. sing.
|
||||||
|
checkOneTerm(a, "lāču", "lāc"); // gen. pl.
|
||||||
|
checkOneTerm(a, "lācim", "lāc"); // dat. sing.
|
||||||
|
checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
|
||||||
|
checkOneTerm(a, "lāci", "lāc"); // acc. sing.
|
||||||
|
checkOneTerm(a, "lāčus", "lāc"); // acc. pl.
|
||||||
|
checkOneTerm(a, "lācī", "lāc"); // loc. sing.
|
||||||
|
checkOneTerm(a, "lāčos", "lāc"); // loc. pl.
|
||||||
|
checkOneTerm(a, "lāci", "lāc"); // voc. sing.
|
||||||
|
checkOneTerm(a, "lāči", "lāc"); // voc. pl.
|
||||||
|
|
||||||
|
// n -> ņ palatalization
|
||||||
|
checkOneTerm(a, "akmens", "akmen"); // nom. sing.
|
||||||
|
checkOneTerm(a, "akmeņi", "akmen"); // nom. pl.
|
||||||
|
checkOneTerm(a, "akmens", "akmen"); // gen. sing.
|
||||||
|
checkOneTerm(a, "akmeņu", "akmen"); // gen. pl.
|
||||||
|
checkOneTerm(a, "akmenim", "akmen"); // dat. sing.
|
||||||
|
checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
|
||||||
|
checkOneTerm(a, "akmeni", "akmen"); // acc. sing.
|
||||||
|
checkOneTerm(a, "akmeņus", "akmen"); // acc. pl.
|
||||||
|
checkOneTerm(a, "akmenī", "akmen"); // loc. sing.
|
||||||
|
checkOneTerm(a, "akmeņos", "akmen"); // loc. pl.
|
||||||
|
checkOneTerm(a, "akmens", "akmen"); // voc. sing.
|
||||||
|
checkOneTerm(a, "akmeņi", "akmen"); // voc. pl.
|
||||||
|
|
||||||
|
// no palatalization
|
||||||
|
checkOneTerm(a, "kurmis", "kurm"); // nom. sing.
|
||||||
|
checkOneTerm(a, "kurmji", "kurm"); // nom. pl.
|
||||||
|
checkOneTerm(a, "kurmja", "kurm"); // gen. sing.
|
||||||
|
checkOneTerm(a, "kurmju", "kurm"); // gen. pl.
|
||||||
|
checkOneTerm(a, "kurmim", "kurm"); // dat. sing.
|
||||||
|
checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
|
||||||
|
checkOneTerm(a, "kurmi", "kurm"); // acc. sing.
|
||||||
|
checkOneTerm(a, "kurmjus", "kurm"); // acc. pl.
|
||||||
|
checkOneTerm(a, "kurmī", "kurm"); // loc. sing.
|
||||||
|
checkOneTerm(a, "kurmjos", "kurm"); // loc. pl.
|
||||||
|
checkOneTerm(a, "kurmi", "kurm"); // voc. sing.
|
||||||
|
checkOneTerm(a, "kurmji", "kurm"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNouns3() throws IOException {
|
||||||
|
// decl III
|
||||||
|
checkOneTerm(a, "lietus", "liet"); // nom. sing.
|
||||||
|
checkOneTerm(a, "lieti", "liet"); // nom. pl.
|
||||||
|
checkOneTerm(a, "lietus", "liet"); // gen. sing.
|
||||||
|
checkOneTerm(a, "lietu", "liet"); // gen. pl.
|
||||||
|
checkOneTerm(a, "lietum", "liet"); // dat. sing.
|
||||||
|
checkOneTerm(a, "lietiem", "liet"); // dat. pl.
|
||||||
|
checkOneTerm(a, "lietu", "liet"); // acc. sing.
|
||||||
|
checkOneTerm(a, "lietus", "liet"); // acc. pl.
|
||||||
|
checkOneTerm(a, "lietū", "liet"); // loc. sing.
|
||||||
|
checkOneTerm(a, "lietos", "liet"); // loc. pl.
|
||||||
|
checkOneTerm(a, "lietus", "liet"); // voc. sing.
|
||||||
|
checkOneTerm(a, "lieti", "liet"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNouns4() throws IOException {
|
||||||
|
// decl IV
|
||||||
|
checkOneTerm(a, "lapa", "lap"); // nom. sing.
|
||||||
|
checkOneTerm(a, "lapas", "lap"); // nom. pl.
|
||||||
|
checkOneTerm(a, "lapas", "lap"); // gen. sing.
|
||||||
|
checkOneTerm(a, "lapu", "lap"); // gen. pl.
|
||||||
|
checkOneTerm(a, "lapai", "lap"); // dat. sing.
|
||||||
|
checkOneTerm(a, "lapām", "lap"); // dat. pl.
|
||||||
|
checkOneTerm(a, "lapu", "lap"); // acc. sing.
|
||||||
|
checkOneTerm(a, "lapas", "lap"); // acc. pl.
|
||||||
|
checkOneTerm(a, "lapā", "lap"); // loc. sing.
|
||||||
|
checkOneTerm(a, "lapās", "lap"); // loc. pl.
|
||||||
|
checkOneTerm(a, "lapa", "lap"); // voc. sing.
|
||||||
|
checkOneTerm(a, "lapas", "lap"); // voc. pl.
|
||||||
|
|
||||||
|
checkOneTerm(a, "puika", "puik"); // nom. sing.
|
||||||
|
checkOneTerm(a, "puikas", "puik"); // nom. pl.
|
||||||
|
checkOneTerm(a, "puikas", "puik"); // gen. sing.
|
||||||
|
checkOneTerm(a, "puiku", "puik"); // gen. pl.
|
||||||
|
checkOneTerm(a, "puikam", "puik"); // dat. sing.
|
||||||
|
checkOneTerm(a, "puikām", "puik"); // dat. pl.
|
||||||
|
checkOneTerm(a, "puiku", "puik"); // acc. sing.
|
||||||
|
checkOneTerm(a, "puikas", "puik"); // acc. pl.
|
||||||
|
checkOneTerm(a, "puikā", "puik"); // loc. sing.
|
||||||
|
checkOneTerm(a, "puikās", "puik"); // loc. pl.
|
||||||
|
checkOneTerm(a, "puika", "puik"); // voc. sing.
|
||||||
|
checkOneTerm(a, "puikas", "puik"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Genitive plural forms with (s,t) -> š and (d,z) -> ž
|
||||||
|
* will not conflate due to ambiguity.
|
||||||
|
*/
|
||||||
|
public void testNouns5() throws IOException {
|
||||||
|
// decl V
|
||||||
|
// l -> ļ palatalization
|
||||||
|
checkOneTerm(a, "egle", "egl"); // nom. sing.
|
||||||
|
checkOneTerm(a, "egles", "egl"); // nom. pl.
|
||||||
|
checkOneTerm(a, "egles", "egl"); // gen. sing.
|
||||||
|
checkOneTerm(a, "egļu", "egl"); // gen. pl.
|
||||||
|
checkOneTerm(a, "eglei", "egl"); // dat. sing.
|
||||||
|
checkOneTerm(a, "eglēm", "egl"); // dat. pl.
|
||||||
|
checkOneTerm(a, "egli", "egl"); // acc. sing.
|
||||||
|
checkOneTerm(a, "egles", "egl"); // acc. pl.
|
||||||
|
checkOneTerm(a, "eglē", "egl"); // loc. sing.
|
||||||
|
checkOneTerm(a, "eglēs", "egl"); // loc. pl.
|
||||||
|
checkOneTerm(a, "egle", "egl"); // voc. sing.
|
||||||
|
checkOneTerm(a, "egles", "egl"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNouns6() throws IOException {
|
||||||
|
// decl VI
|
||||||
|
|
||||||
|
// no palatalization
|
||||||
|
checkOneTerm(a, "govs", "gov"); // nom. sing.
|
||||||
|
checkOneTerm(a, "govis", "gov"); // nom. pl.
|
||||||
|
checkOneTerm(a, "govs", "gov"); // gen. sing.
|
||||||
|
checkOneTerm(a, "govju", "gov"); // gen. pl.
|
||||||
|
checkOneTerm(a, "govij", "gov"); // dat. sing.
|
||||||
|
checkOneTerm(a, "govīm", "gov"); // dat. pl.
|
||||||
|
checkOneTerm(a, "govi ", "gov"); // acc. sing.
|
||||||
|
checkOneTerm(a, "govis", "gov"); // acc. pl.
|
||||||
|
checkOneTerm(a, "govi ", "gov"); // inst. sing.
|
||||||
|
checkOneTerm(a, "govīm", "gov"); // inst. pl.
|
||||||
|
checkOneTerm(a, "govī", "gov"); // loc. sing.
|
||||||
|
checkOneTerm(a, "govīs", "gov"); // loc. pl.
|
||||||
|
checkOneTerm(a, "govs", "gov"); // voc. sing.
|
||||||
|
checkOneTerm(a, "govis", "gov"); // voc. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAdjectives() throws IOException {
|
||||||
|
checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing.
|
||||||
|
checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing.
|
||||||
|
checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl.
|
||||||
|
checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl.
|
||||||
|
checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing.
|
||||||
|
checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing.
|
||||||
|
checkOneTerm(a, "zilas", "zil"); // indef. nom. fem. pl.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl.
|
||||||
|
checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing.
|
||||||
|
checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing.
|
||||||
|
checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl.
|
||||||
|
checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl.
|
||||||
|
checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing.
|
||||||
|
checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl.
|
||||||
|
checkOneTerm(a, "zilo", "zil"); // def. gen. fem. pl.
|
||||||
|
checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing.
|
||||||
|
checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing.
|
||||||
|
checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl.
|
||||||
|
checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
|
||||||
|
checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing.
|
||||||
|
checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing.
|
||||||
|
checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl.
|
||||||
|
checkOneTerm(a, "zilajām", "zil"); // def. dat. fem. pl.
|
||||||
|
checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing.
|
||||||
|
checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing.
|
||||||
|
checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl.
|
||||||
|
checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl.
|
||||||
|
checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing.
|
||||||
|
checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // indef. acc. fem. pl.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl.
|
||||||
|
checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing.
|
||||||
|
checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing.
|
||||||
|
checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl.
|
||||||
|
checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl.
|
||||||
|
checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing.
|
||||||
|
checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl.
|
||||||
|
checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl.
|
||||||
|
checkOneTerm(a, "zilais", "zil"); // voc. masc. sing.
|
||||||
|
checkOneTerm(a, "zilie", "zil"); // voc. masc. pl.
|
||||||
|
checkOneTerm(a, "zilā", "zil"); // voc. fem. sing.
|
||||||
|
checkOneTerm(a, "zilās", "zil"); // voc. fem. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: we intentionally don't handle the ambiguous
|
||||||
|
* (s,t) -> š and (d,z) -> ž
|
||||||
|
*/
|
||||||
|
public void testPalatalization() throws IOException {
|
||||||
|
checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
|
||||||
|
checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
|
||||||
|
checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
|
||||||
|
checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
|
||||||
|
checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
|
||||||
|
checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl.
|
||||||
|
checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
|
||||||
|
checkOneTerm(a, "zižļu", "zizl"); // gen. pl.
|
||||||
|
checkOneTerm(a, "vilnis", "viln"); // nom. sing.
|
||||||
|
checkOneTerm(a, "viļņu", "viln"); // gen. pl.
|
||||||
|
checkOneTerm(a, "lelle", "lell"); // nom. sing.
|
||||||
|
checkOneTerm(a, "leļļu", "lell"); // gen. pl.
|
||||||
|
checkOneTerm(a, "pinne", "pinn"); // nom. sing.
|
||||||
|
checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
|
||||||
|
checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
|
||||||
|
checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test some length restrictions, we require a 3+ char stem,
|
||||||
|
* with at least one vowel.
|
||||||
|
*/
|
||||||
|
public void testLength() throws IOException {
|
||||||
|
checkOneTerm(a, "usa", "usa"); // length
|
||||||
|
checkOneTerm(a, "60ms", "60ms"); // vowel count
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.lv.LatvianStemFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link LatvianStemFilter}.
|
||||||
|
* <pre class="prettyprint" >
|
||||||
|
* <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
* <filter class="solr.LatvianStemFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*/
|
||||||
|
public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new LatvianStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Latvian stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("tirgiem tirgus");
|
||||||
|
LatvianStemFilterFactory factory = new LatvianStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "tirg", "tirg" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue