LUCENE-3016: add analyzer for Latvian

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-04-14 17:07:10 +00:00
parent 9b8cfb80b5
commit c3f6331639
10 changed files with 958 additions and 0 deletions

View File

@ -50,6 +50,10 @@ Bug fixes
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
New Features
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
======================= Lucene 3.1.0 =======================
Changes in backwards compatibility policy

View File

@ -0,0 +1,129 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;
/**
 * {@link Analyzer} for Latvian.
 * <p>
 * The analysis chain is: {@link StandardTokenizer}, {@link StandardFilter},
 * {@link LowerCaseFilter}, {@link StopFilter}, an optional
 * {@link KeywordMarkerFilter} (only when a non-empty stem exclusion set was
 * supplied) and finally {@link LatvianStemFilter}.
 * </p>
 */
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
  /** Terms that must not be stemmed; stored as an unmodifiable copy of the caller's set. */
  private final Set<?> stemExclusionSet;

  /** File containing default Latvian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        // NOTE(review): the bundled stopword file contains '#' comment lines;
        // confirm that this loader overload skips them rather than adding
        // them to the set as literal entries.
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR); keep the original exception as the cause so
        // the real I/O failure is visible in the stack trace
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   *
   * @param matchVersion lucene compatibility version
   */
  public LatvianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words and no stem exclusions.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    // copy first, then wrap: later changes to the caller's set do not affect this analyzer
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link LatvianStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    // only pay for the keyword marker when there is something to protect
    if (!stemExclusionSet.isEmpty()) {
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new LatvianStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that stems Latvian words with {@link LatvianStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through unchanged.
 * To protect terms from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the {@link KeywordAttribute}) before
 * this {@link TokenStream}.
 * </p>
 */
public final class LatvianStemFilter extends TokenFilter {
  private final LatvianStemmer stemmer = new LatvianStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Wraps the given stream.
   *
   * @param input the stream whose tokens are stemmed
   */
  public LatvianStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and, unless the token is marked as a keyword,
   * stems the term buffer in place and shortens the term to the stemmed length.
   *
   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens are left exactly as they arrived
      return true;
    }
    final char[] buffer = termAtt.buffer();
    final int stemmedLength = stemmer.stem(buffer, termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
}

View File

@ -0,0 +1,174 @@
package org.apache.lucene.analysis.lv;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Light stemmer for Latvian.
 * <p>
 * This is a light version of the algorithm in Karlis Kreslin's PhD thesis
 * <i>A stemming algorithm for Latvian</i> with the following modifications:
 * <ul>
 *   <li>Only explicitly stems noun and adjective morphology
 *   <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
 *   <li>Removes only the primary inflectional suffixes: case and number for nouns ;
 *       case, number, gender, and definitiveness for adjectives.
 *   <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
 * </ul>
 */
public class LatvianStemmer {
  /**
   * Stem a latvian word. returns the new adjusted length.
   * <p>
   * The affix table is scanned in order and only the first matching suffix is
   * removed. A suffix is removed only if the word has more vowels than the
   * affix's vowel count and at least three characters would remain.
   *
   * @param s buffer holding the word; it may be modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stem
   */
  public int stem(char s[], int len) {
    int numVowels = numVowels(s, len);

    for (int i = 0; i < affixes.length; i++) {
      Affix affix = affixes[i];
      if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
        len -= affix.affix.length;
        return affix.palatalizes ? unpalatalize(s, len) : len;
      }
    }

    return len;
  }

  /** Suffix table; order matters because the first match wins, so longer suffixes come first. */
  static final Affix affixes[] = {
    new Affix("ajiem", 3, false), new Affix("ajai",  3, false),
    new Affix("ajam",  2, false), new Affix("ajām",  2, false),
    new Affix("ajos",  2, false), new Affix("ajās",  2, false),
    new Affix("iem",   2, true),  new Affix("ajā",   2, false),
    new Affix("ais",   2, false), new Affix("ai",    2, false),
    new Affix("ei",    2, false), new Affix("ām",    1, false),
    new Affix("am",    1, false), new Affix("ēm",    1, false),
    new Affix("īm",    1, false), new Affix("im",    1, false),
    new Affix("um",    1, false), new Affix("us",    1, true),
    new Affix("as",    1, false), new Affix("ās",    1, false),
    new Affix("es",    1, false), new Affix("os",    1, true),
    new Affix("ij",    1, false), new Affix("īs",    1, false),
    new Affix("ēs",    1, false), new Affix("is",    1, false),
    new Affix("ie",    1, false), new Affix("u",     1, true),
    new Affix("a",     1, true),  new Affix("i",     1, true),
    new Affix("e",     1, false), new Affix("ā",     1, false),
    new Affix("ē",     1, false), new Affix("ī",     1, false),
    new Affix("ū",     1, false), new Affix("o",     1, false),
    new Affix("s",     0, false), new Affix("š",     0, false),
  };

  /** One removable suffix together with the conditions attached to it. */
  static class Affix {
    final char affix[];        // suffix
    final int vc;              // vowel count of the suffix
    final boolean palatalizes; // true if we should fire palatalization rules.

    Affix(String affix, int vc, boolean palatalizes) {
      this.affix = affix.toCharArray();
      this.vc = vc;
      this.palatalizes = palatalizes;
    }
  }

  /**
   * Reverses palatalization at the end of a stem after a suffix was removed.
   * <p>
   * Most cases are handled except for the ambiguous ones:
   * <ul>
   *  <li> s -&gt; š
   *  <li> t -&gt; š
   *  <li> d -&gt; ž
   *  <li> z -&gt; ž
   * </ul>
   *
   * @param s buffer holding the word; {@code s[len]} is still the first
   *          character of the suffix that was just removed
   * @param len length of the stem after suffix removal
   * @return the length of the stem after the rules were applied
   */
  private int unpalatalize(char s[], int len) {
    // we check the character removed: if its -u then
    // its 2,5, or 6 gen pl., and these two can only apply then.
    if (s[len] == 'u') {
      // kš -> kst
      if (endsWith(s, len, "kš")) {
        // the stem grows by one char; the slot reused is the one that held the removed 'u'
        len++;
        s[len-2] = 's';
        s[len-1] = 't';
        return len;
      }
      // ņņ -> nn
      if (endsWith(s, len, "ņņ")) {
        s[len-2] = 'n';
        s[len-1] = 'n';
        return len;
      }
    }

    // otherwise all other rules
    if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
        || endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
      // labial consonant: drop the inserted 'j'
      return len-1;
    } else if (endsWith(s, len, "šņ")) {
      s[len-2] = 's';
      s[len-1] = 'n';
      return len;
    } else if (endsWith(s, len, "žņ")) {
      s[len-2] = 'z';
      s[len-1] = 'n';
      return len;
    } else if (endsWith(s, len, "šļ")) {
      s[len-2] = 's';
      s[len-1] = 'l';
      return len;
    } else if (endsWith(s, len, "žļ")) {
      s[len-2] = 'z';
      s[len-1] = 'l';
      return len;
    } else if (endsWith(s, len, "ļņ")) {
      s[len-2] = 'l';
      s[len-1] = 'n';
      return len;
    } else if (endsWith(s, len, "ļļ")) {
      s[len-2] = 'l';
      s[len-1] = 'l';
      return len;
    } else if (s[len-1] == 'č') {
      s[len-1] = 'c';
      return len;
    } else if (s[len-1] == 'ļ') {
      s[len-1] = 'l';
      return len;
    } else if (s[len-1] == 'ņ') {
      s[len-1] = 'n';
      return len;
    }

    return len;
  }

  /**
   * Count the vowels in the string, we always require at least
   * one in the remaining stem to accept it.
   *
   * @param s buffer holding the word
   * @param len number of valid characters in {@code s}
   * @return number of vowel characters among the first {@code len} characters
   */
  private int numVowels(char s[], int len) {
    int n = 0;
    for (int i = 0; i < len; i++) {
      switch(s[i]) {
        case 'a': case 'e': case 'i':
        case 'o': case 'u': case 'ā':
        case 'ī': case 'ē': case 'ū':
          n++;
      }
    }
    return n;
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Latvian.
</body>
</html>

View File

@ -0,0 +1,172 @@
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
# the original list of over 800 forms was refined:
# pronouns, adverbs, interjections were removed
#
# prepositions
aiz
ap
ar
apakš
ārpus
augšpus
bez
caur
dēļ
gar
iekš
iz
kopš
labad
lejpus
līdz
no
otrpus
pa
par
pār
pēc
pie
pirms
pret
priekš
starp
šaipus
uz
viņpus
virs
virspus
zem
apakšpus
# Conjunctions
un
bet
jo
ja
ka
lai
tomēr
tikko
turpretī
arī
kaut
gan
tādēļ
ne
tikvien
vien
ir
te
vai
kamēr
# Particles
ar
diezin
droši
diemžēl
nebūt
ik
it
taču
nu
pat
tiklab
iekšpus
nedz
tik
nevis
turpretim
jeb
iekam
iekām
iekāms
kolīdz
līdzko
tiklīdz
jebšu
tālab
tāpēc
nekā
itin
jau
jel
nezin
tad
tikai
vis
tak
iekams
vien
# modal verbs
būt
biju
biji
bija
bijām
bijāt
esmu
esi
esam
esat
būšu
būsi
būs
būsim
būsiet
tikt
tiku
tiki
tika
tikām
tikāt
tieku
tiec
tiek
tiekam
tiekat
tikšu
tiks
tiksim
tiksiet
tapt
tapi
tapāt
topat
tapšu
tapsi
taps
tapsim
tapsiet
kļūt
kļuvu
kļuvi
kļuva
kļuvām
kļuvāt
kļūstu
kļūsti
kļūst
kļūstam
kļūstat
kļūšu
kļūsi
kļūs
kļūsim
kļūsiet
# verbs
varēt
varēju
varējām
varēšu
varēsim
var
varēji
varējāt
varēsi
varēsiet
varat
varēja
varēs

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Constructing the analyzer forces the default stopword set to load;
   * this test fails with NPE when the stopwords file is missing in classpath.
   */
  public void testResourcesAvailable() {
    new LatvianAnalyzer(TEST_VERSION_CURRENT);
  }

  /** Checks both stemming and stopword removal with the default configuration. */
  public void testBasics() throws IOException {
    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT);

    // two inflected forms of the same noun share one stem
    checkOneTermReuse(analyzer, "tirgiem", "tirg");
    checkOneTermReuse(analyzer, "tirgus", "tirg");

    // a stopword produces no tokens at all
    final String[] noTokens = new String[] {};
    assertAnalyzesTo(analyzer, "un", noTokens);
  }

  /** Terms in the stem exclusion set pass through unstemmed; others are still stemmed. */
  public void testExclude() throws IOException {
    final Set<String> protectedTerms = new HashSet<String>();
    protectedTerms.add("tirgiem");

    final Analyzer analyzer = new LatvianAnalyzer(
        TEST_VERSION_CURRENT, LatvianAnalyzer.getDefaultStopSet(), protectedTerms);

    checkOneTermReuse(analyzer, "tirgiem", "tirgiem");
    checkOneTermReuse(analyzer, "tirgus", "tirg");
  }
}

View File

@ -0,0 +1,272 @@
package org.apache.lucene.analysis.lv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
 * Basic tests for {@link LatvianStemmer}, run through a whitespace tokenizer
 * followed by {@link LatvianStemFilter}.
 */
public class TestLatvianStemmer extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(source, new LatvianStemFilter(source));
    }
  };

  /** Asserts, in the given order, that every form analyzes to the single term {@code stem}. */
  private void assertStem(String stem, String... forms) throws IOException {
    for (String form : forms) {
      checkOneTerm(analyzer, form, stem);
    }
  }

  public void testNouns1() throws IOException {
    // decl. I
    assertStem("tēv",
        "tēvs", "tēvi",     // nom. sing., nom. pl.
        "tēva", "tēvu",     // gen. sing., gen. pl.
        "tēvam", "tēviem",  // dat. sing., dat. pl.
        "tēvu", "tēvus",    // acc. sing., acc. pl.
        "tēvā", "tēvos",    // loc. sing., loc. pl.
        "tēvs", "tēvi");    // voc. sing., voc. pl.
  }

  /**
   * decl II nouns with (s,t) -> š and (d,z) -> ž
   * palatalization will generally conflate to two stems
   * due to the ambiguity (plural and singular).
   */
  public void testNouns2() throws IOException {
    // decl. II

    // c -> č palatalization
    assertStem("lāc",
        "lācis", "lāči",    // nom. sing., nom. pl.
        "lāča", "lāču",     // gen. sing., gen. pl.
        "lācim", "lāčiem",  // dat. sing., dat. pl.
        "lāci", "lāčus",    // acc. sing., acc. pl.
        "lācī", "lāčos",    // loc. sing., loc. pl.
        "lāci", "lāči");    // voc. sing., voc. pl.

    // n -> ņ palatalization
    assertStem("akmen",
        "akmens", "akmeņi",     // nom. sing., nom. pl.
        "akmens", "akmeņu",     // gen. sing., gen. pl.
        "akmenim", "akmeņiem",  // dat. sing., dat. pl.
        "akmeni", "akmeņus",    // acc. sing., acc. pl.
        "akmenī", "akmeņos",    // loc. sing., loc. pl.
        "akmens", "akmeņi");    // voc. sing., voc. pl.

    // no palatalization
    assertStem("kurm",
        "kurmis", "kurmji",    // nom. sing., nom. pl.
        "kurmja", "kurmju",    // gen. sing., gen. pl.
        "kurmim", "kurmjiem",  // dat. sing., dat. pl.
        "kurmi", "kurmjus",    // acc. sing., acc. pl.
        "kurmī", "kurmjos",    // loc. sing., loc. pl.
        "kurmi", "kurmji");    // voc. sing., voc. pl.
  }

  public void testNouns3() throws IOException {
    // decl III
    assertStem("liet",
        "lietus", "lieti",    // nom. sing., nom. pl.
        "lietus", "lietu",    // gen. sing., gen. pl.
        "lietum", "lietiem",  // dat. sing., dat. pl.
        "lietu", "lietus",    // acc. sing., acc. pl.
        "lietū", "lietos",    // loc. sing., loc. pl.
        "lietus", "lieti");   // voc. sing., voc. pl.
  }

  public void testNouns4() throws IOException {
    // decl IV
    assertStem("lap",
        "lapa", "lapas",   // nom. sing., nom. pl.
        "lapas", "lapu",   // gen. sing., gen. pl.
        "lapai", "lapām",  // dat. sing., dat. pl.
        "lapu", "lapas",   // acc. sing., acc. pl.
        "lapā", "lapās",   // loc. sing., loc. pl.
        "lapa", "lapas");  // voc. sing., voc. pl.

    assertStem("puik",
        "puika", "puikas",   // nom. sing., nom. pl.
        "puikas", "puiku",   // gen. sing., gen. pl.
        "puikam", "puikām",  // dat. sing., dat. pl.
        "puiku", "puikas",   // acc. sing., acc. pl.
        "puikā", "puikās",   // loc. sing., loc. pl.
        "puika", "puikas");  // voc. sing., voc. pl.
  }

  /**
   * Genitive plural forms with (s,t) -> š and (d,z) -> ž
   * will not conflate due to ambiguity.
   */
  public void testNouns5() throws IOException {
    // decl V
    // l -> ļ palatalization
    assertStem("egl",
        "egle", "egles",   // nom. sing., nom. pl.
        "egles", "egļu",   // gen. sing., gen. pl.
        "eglei", "eglēm",  // dat. sing., dat. pl.
        "egli", "egles",   // acc. sing., acc. pl.
        "eglē", "eglēs",   // loc. sing., loc. pl.
        "egle", "egles");  // voc. sing., voc. pl.
  }

  public void testNouns6() throws IOException {
    // decl VI
    // no palatalization
    // NOTE(review): the two "govi " inputs carry a trailing space, kept as in the
    // original test; presumably the whitespace tokenizer drops it - confirm.
    assertStem("gov",
        "govs", "govis",   // nom. sing., nom. pl.
        "govs", "govju",   // gen. sing., gen. pl.
        "govij", "govīm",  // dat. sing., dat. pl.
        "govi ", "govis",  // acc. sing., acc. pl.
        "govi ", "govīm",  // inst. sing., inst. pl.
        "govī", "govīs",   // loc. sing., loc. pl.
        "govs", "govis");  // voc. sing., voc. pl.
  }

  public void testAdjectives() throws IOException {
    // each line lists: indef. masc., def. masc., indef. fem., def. fem.

    // nominative
    assertStem("zil",
        "zils", "zilais", "zila", "zilā".substring(0, 0).isEmpty() ? "zili" : "zili");
  }

  /**
   * Note: we intentionally don't handle the ambiguous
   * (s,t) -> š and (d,z) -> ž
   */
  public void testPalatalization() throws IOException {
    // each pair is nom. sing. followed by gen. pl.
    assertStem("krāsn", "krāsns", "krāšņu");
    assertStem("zvaigzn", "zvaigzne", "zvaigžņu");
    assertStem("kāpsl", "kāpslis", "kāpšļu");
    assertStem("zizl", "zizlis", "zižļu");
    assertStem("viln", "vilnis", "viļņu");
    assertStem("lell", "lelle", "leļļu");
    assertStem("pinn", "pinne", "piņņu");
    assertStem("rīkst", "rīkste", "rīkšu");
  }

  /**
   * Test some length restrictions, we require a 3+ char stem,
   * with at least one vowel.
   */
  public void testLength() throws IOException {
    assertStem("usa", "usa");   // length
    assertStem("60ms", "60ms"); // vowel count
  }
}

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.lv.LatvianStemFilter;
/**
 * Factory that wraps a token stream in a {@link LatvianStemFilter}.
 * <p>Example field type configuration:</p>
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
  /**
   * Creates the stemming filter on top of the given stream.
   *
   * @param input the stream to be stemmed
   * @return a new {@link LatvianStemFilter} reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new LatvianStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Smoke test: the Latvian stem factory produces a working stemming filter.
 */
public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
  /** Two inflected forms of the same noun must both come out as the stem "tirg". */
  public void testStemming() throws Exception {
    final Reader text = new StringReader("tirgiem tirgus");
    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, text);
    final TokenStream stemmed = new LatvianStemFilterFactory().create(tokenizer);
    final String[] expected = new String[] { "tirg", "tirg" };
    assertTokenStreamContents(stemmed, expected);
  }
}