mirror of https://github.com/apache/lucene.git
LUCENE-3016: add analyzer for Latvian
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9b8cfb80b5
commit
c3f6331639
|
@ -50,6 +50,10 @@ Bug fixes
|
|||
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
|
||||
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
|
||||
|
||||
======================= Lucene 3.1.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Latvian.
|
||||
*/
|
||||
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
|
||||
/** File containing default Latvian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||
* provided this analyzer will add a {@link KeywordMarkerFilter} before
|
||||
* stemming.
|
||||
*
|
||||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A
|
||||
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from an {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||
* , {@link KeywordMarkerFilter} if a stem exclusion set is
|
||||
* provided and {@link LatvianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new LatvianStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
*/
|
||||
public final class LatvianStemFilter extends TokenFilter {
|
||||
private final LatvianStemmer stemmer = new LatvianStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public LatvianStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,174 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Light stemmer for Latvian.
|
||||
* <p>
|
||||
* This is a light version of the algorithm in Karlis Kreslin's PhD thesis
|
||||
* <i>A stemming algorithm for Latvian</i> with the following modifications:
|
||||
* <ul>
|
||||
* <li>Only explicitly stems noun and adjective morphology
|
||||
* <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
|
||||
* <li>Removes only the primary inflectional suffixes: case and number for nouns ;
|
||||
* case, number, gender, and definitiveness for adjectives.
|
||||
* <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
|
||||
* </ul>
|
||||
*/
|
||||
public class LatvianStemmer {
|
||||
/**
|
||||
* Stem a latvian word. returns the new adjusted length.
|
||||
*/
|
||||
public int stem(char s[], int len) {
|
||||
int numVowels = numVowels(s, len);
|
||||
|
||||
for (int i = 0; i < affixes.length; i++) {
|
||||
Affix affix = affixes[i];
|
||||
if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
|
||||
len -= affix.affix.length;
|
||||
return affix.palatalizes ? unpalatalize(s, len) : len;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
static final Affix affixes[] = {
|
||||
new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
|
||||
new Affix("ajam", 2, false), new Affix("ajām", 2, false),
|
||||
new Affix("ajos", 2, false), new Affix("ajās", 2, false),
|
||||
new Affix("iem", 2, true), new Affix("ajā", 2, false),
|
||||
new Affix("ais", 2, false), new Affix("ai", 2, false),
|
||||
new Affix("ei", 2, false), new Affix("ām", 1, false),
|
||||
new Affix("am", 1, false), new Affix("ēm", 1, false),
|
||||
new Affix("īm", 1, false), new Affix("im", 1, false),
|
||||
new Affix("um", 1, false), new Affix("us", 1, true),
|
||||
new Affix("as", 1, false), new Affix("ās", 1, false),
|
||||
new Affix("es", 1, false), new Affix("os", 1, true),
|
||||
new Affix("ij", 1, false), new Affix("īs", 1, false),
|
||||
new Affix("ēs", 1, false), new Affix("is", 1, false),
|
||||
new Affix("ie", 1, false), new Affix("u", 1, true),
|
||||
new Affix("a", 1, true), new Affix("i", 1, true),
|
||||
new Affix("e", 1, false), new Affix("ā", 1, false),
|
||||
new Affix("ē", 1, false), new Affix("ī", 1, false),
|
||||
new Affix("ū", 1, false), new Affix("o", 1, false),
|
||||
new Affix("s", 0, false), new Affix("š", 0, false),
|
||||
};
|
||||
|
||||
static class Affix {
|
||||
char affix[]; // suffix
|
||||
int vc; // vowel count of the suffix
|
||||
boolean palatalizes; // true if we should fire palatalization rules.
|
||||
|
||||
Affix(String affix, int vc, boolean palatalizes) {
|
||||
this.affix = affix.toCharArray();
|
||||
this.vc = vc;
|
||||
this.palatalizes = palatalizes;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Most cases are handled except for the ambiguous ones:
|
||||
* <ul>
|
||||
* <li> s -> š
|
||||
* <li> t -> š
|
||||
* <li> d -> ž
|
||||
* <li> z -> ž
|
||||
* </ul>
|
||||
*/
|
||||
private int unpalatalize(char s[], int len) {
|
||||
// we check the character removed: if its -u then
|
||||
// its 2,5, or 6 gen pl., and these two can only apply then.
|
||||
if (s[len] == 'u') {
|
||||
// kš -> kst
|
||||
if (endsWith(s, len, "kš")) {
|
||||
len++;
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 't';
|
||||
return len;
|
||||
}
|
||||
// ņņ -> nn
|
||||
if (endsWith(s, len, "ņņ")) {
|
||||
s[len-2] = 'n';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise all other rules
|
||||
if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
|
||||
|| endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
|
||||
// labial consonant
|
||||
return len-1;
|
||||
} else if (endsWith(s, len, "šņ")) {
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "žņ")) {
|
||||
s[len-2] = 'z';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "šļ")) {
|
||||
s[len-2] = 's';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "žļ")) {
|
||||
s[len-2] = 'z';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "ļņ")) {
|
||||
s[len-2] = 'l';
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
} else if (endsWith(s, len, "ļļ")) {
|
||||
s[len-2] = 'l';
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (s[len-1] == 'č') {
|
||||
s[len-1] = 'c';
|
||||
return len;
|
||||
} else if (s[len-1] == 'ļ') {
|
||||
s[len-1] = 'l';
|
||||
return len;
|
||||
} else if (s[len-1] == 'ņ') {
|
||||
s[len-1] = 'n';
|
||||
return len;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the vowels in the string, we always require at least
|
||||
* one in the remaining stem to accept it.
|
||||
*/
|
||||
private int numVowels(char s[], int len) {
|
||||
int n = 0;
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch(s[i]) {
|
||||
case 'a': case 'e': case 'i':
|
||||
case 'o': case 'u': case 'ā':
|
||||
case 'ī': case 'ē': case 'ū':
|
||||
n++;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Latvian.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,172 @@
|
|||
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
|
||||
# the original list of over 800 forms was refined:
|
||||
# pronouns, adverbs, interjections were removed
|
||||
#
|
||||
# prepositions
|
||||
aiz
|
||||
ap
|
||||
ar
|
||||
apakš
|
||||
ārpus
|
||||
augšpus
|
||||
bez
|
||||
caur
|
||||
dēļ
|
||||
gar
|
||||
iekš
|
||||
iz
|
||||
kopš
|
||||
labad
|
||||
lejpus
|
||||
līdz
|
||||
no
|
||||
otrpus
|
||||
pa
|
||||
par
|
||||
pār
|
||||
pēc
|
||||
pie
|
||||
pirms
|
||||
pret
|
||||
priekš
|
||||
starp
|
||||
šaipus
|
||||
uz
|
||||
viņpus
|
||||
virs
|
||||
virspus
|
||||
zem
|
||||
apakšpus
|
||||
# Conjunctions
|
||||
un
|
||||
bet
|
||||
jo
|
||||
ja
|
||||
ka
|
||||
lai
|
||||
tomēr
|
||||
tikko
|
||||
turpretī
|
||||
arī
|
||||
kaut
|
||||
gan
|
||||
tādēļ
|
||||
tā
|
||||
ne
|
||||
tikvien
|
||||
vien
|
||||
kā
|
||||
ir
|
||||
te
|
||||
vai
|
||||
kamēr
|
||||
# Particles
|
||||
ar
|
||||
diezin
|
||||
droši
|
||||
diemžēl
|
||||
nebūt
|
||||
ik
|
||||
it
|
||||
taču
|
||||
nu
|
||||
pat
|
||||
tiklab
|
||||
iekšpus
|
||||
nedz
|
||||
tik
|
||||
nevis
|
||||
turpretim
|
||||
jeb
|
||||
iekam
|
||||
iekām
|
||||
iekāms
|
||||
kolīdz
|
||||
līdzko
|
||||
tiklīdz
|
||||
jebšu
|
||||
tālab
|
||||
tāpēc
|
||||
nekā
|
||||
itin
|
||||
jā
|
||||
jau
|
||||
jel
|
||||
nē
|
||||
nezin
|
||||
tad
|
||||
tikai
|
||||
vis
|
||||
tak
|
||||
iekams
|
||||
vien
|
||||
# modal verbs
|
||||
būt
|
||||
biju
|
||||
biji
|
||||
bija
|
||||
bijām
|
||||
bijāt
|
||||
esmu
|
||||
esi
|
||||
esam
|
||||
esat
|
||||
būšu
|
||||
būsi
|
||||
būs
|
||||
būsim
|
||||
būsiet
|
||||
tikt
|
||||
tiku
|
||||
tiki
|
||||
tika
|
||||
tikām
|
||||
tikāt
|
||||
tieku
|
||||
tiec
|
||||
tiek
|
||||
tiekam
|
||||
tiekat
|
||||
tikšu
|
||||
tiks
|
||||
tiksim
|
||||
tiksiet
|
||||
tapt
|
||||
tapi
|
||||
tapāt
|
||||
topat
|
||||
tapšu
|
||||
tapsi
|
||||
taps
|
||||
tapsim
|
||||
tapsiet
|
||||
kļūt
|
||||
kļuvu
|
||||
kļuvi
|
||||
kļuva
|
||||
kļuvām
|
||||
kļuvāt
|
||||
kļūstu
|
||||
kļūsti
|
||||
kļūst
|
||||
kļūstam
|
||||
kļūstat
|
||||
kļūšu
|
||||
kļūsi
|
||||
kļūs
|
||||
kļūsim
|
||||
kļūsiet
|
||||
# verbs
|
||||
varēt
|
||||
varēju
|
||||
varējām
|
||||
varēšu
|
||||
varēsim
|
||||
var
|
||||
varēji
|
||||
varējāt
|
||||
varēsi
|
||||
varēsiet
|
||||
varat
|
||||
varēja
|
||||
varēs
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||
}
|
||||
|
||||
/** test stopwords and stemming */
|
||||
public void testBasics() throws IOException {
|
||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
|
||||
// stemming
|
||||
checkOneTermReuse(a, "tirgiem", "tirg");
|
||||
checkOneTermReuse(a, "tirgus", "tirg");
|
||||
// stopword
|
||||
assertAnalyzesTo(a, "un", new String[] {});
|
||||
}
|
||||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("tirgiem");
|
||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
||||
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
||||
checkOneTermReuse(a, "tirgus", "tirg");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,272 @@
|
|||
package org.apache.lucene.analysis.lv;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
|
||||
/**
|
||||
* Basic tests for {@link LatvianStemmer}
|
||||
*/
|
||||
public class TestLatvianStemmer extends BaseTokenStreamTestCase {
|
||||
private Analyzer a = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
|
||||
public void testNouns1() throws IOException {
|
||||
// decl. I
|
||||
checkOneTerm(a, "tēvs", "tēv"); // nom. sing.
|
||||
checkOneTerm(a, "tēvi", "tēv"); // nom. pl.
|
||||
checkOneTerm(a, "tēva", "tēv"); // gen. sing.
|
||||
checkOneTerm(a, "tēvu", "tēv"); // gen. pl.
|
||||
checkOneTerm(a, "tēvam", "tēv"); // dat. sing.
|
||||
checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
|
||||
checkOneTerm(a, "tēvu", "tēv"); // acc. sing.
|
||||
checkOneTerm(a, "tēvus", "tēv"); // acc. pl.
|
||||
checkOneTerm(a, "tēvā", "tēv"); // loc. sing.
|
||||
checkOneTerm(a, "tēvos", "tēv"); // loc. pl.
|
||||
checkOneTerm(a, "tēvs", "tēv"); // voc. sing.
|
||||
checkOneTerm(a, "tēvi", "tēv"); // voc. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* decl II nouns with (s,t) -> š and (d,z) -> ž
|
||||
* palatalization will generally conflate to two stems
|
||||
* due to the ambiguity (plural and singular).
|
||||
*/
|
||||
public void testNouns2() throws IOException {
|
||||
// decl. II
|
||||
|
||||
// c -> č palatalization
|
||||
checkOneTerm(a, "lācis", "lāc"); // nom. sing.
|
||||
checkOneTerm(a, "lāči", "lāc"); // nom. pl.
|
||||
checkOneTerm(a, "lāča", "lāc"); // gen. sing.
|
||||
checkOneTerm(a, "lāču", "lāc"); // gen. pl.
|
||||
checkOneTerm(a, "lācim", "lāc"); // dat. sing.
|
||||
checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
|
||||
checkOneTerm(a, "lāci", "lāc"); // acc. sing.
|
||||
checkOneTerm(a, "lāčus", "lāc"); // acc. pl.
|
||||
checkOneTerm(a, "lācī", "lāc"); // loc. sing.
|
||||
checkOneTerm(a, "lāčos", "lāc"); // loc. pl.
|
||||
checkOneTerm(a, "lāci", "lāc"); // voc. sing.
|
||||
checkOneTerm(a, "lāči", "lāc"); // voc. pl.
|
||||
|
||||
// n -> ņ palatalization
|
||||
checkOneTerm(a, "akmens", "akmen"); // nom. sing.
|
||||
checkOneTerm(a, "akmeņi", "akmen"); // nom. pl.
|
||||
checkOneTerm(a, "akmens", "akmen"); // gen. sing.
|
||||
checkOneTerm(a, "akmeņu", "akmen"); // gen. pl.
|
||||
checkOneTerm(a, "akmenim", "akmen"); // dat. sing.
|
||||
checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
|
||||
checkOneTerm(a, "akmeni", "akmen"); // acc. sing.
|
||||
checkOneTerm(a, "akmeņus", "akmen"); // acc. pl.
|
||||
checkOneTerm(a, "akmenī", "akmen"); // loc. sing.
|
||||
checkOneTerm(a, "akmeņos", "akmen"); // loc. pl.
|
||||
checkOneTerm(a, "akmens", "akmen"); // voc. sing.
|
||||
checkOneTerm(a, "akmeņi", "akmen"); // voc. pl.
|
||||
|
||||
// no palatalization
|
||||
checkOneTerm(a, "kurmis", "kurm"); // nom. sing.
|
||||
checkOneTerm(a, "kurmji", "kurm"); // nom. pl.
|
||||
checkOneTerm(a, "kurmja", "kurm"); // gen. sing.
|
||||
checkOneTerm(a, "kurmju", "kurm"); // gen. pl.
|
||||
checkOneTerm(a, "kurmim", "kurm"); // dat. sing.
|
||||
checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
|
||||
checkOneTerm(a, "kurmi", "kurm"); // acc. sing.
|
||||
checkOneTerm(a, "kurmjus", "kurm"); // acc. pl.
|
||||
checkOneTerm(a, "kurmī", "kurm"); // loc. sing.
|
||||
checkOneTerm(a, "kurmjos", "kurm"); // loc. pl.
|
||||
checkOneTerm(a, "kurmi", "kurm"); // voc. sing.
|
||||
checkOneTerm(a, "kurmji", "kurm"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns3() throws IOException {
|
||||
// decl III
|
||||
checkOneTerm(a, "lietus", "liet"); // nom. sing.
|
||||
checkOneTerm(a, "lieti", "liet"); // nom. pl.
|
||||
checkOneTerm(a, "lietus", "liet"); // gen. sing.
|
||||
checkOneTerm(a, "lietu", "liet"); // gen. pl.
|
||||
checkOneTerm(a, "lietum", "liet"); // dat. sing.
|
||||
checkOneTerm(a, "lietiem", "liet"); // dat. pl.
|
||||
checkOneTerm(a, "lietu", "liet"); // acc. sing.
|
||||
checkOneTerm(a, "lietus", "liet"); // acc. pl.
|
||||
checkOneTerm(a, "lietū", "liet"); // loc. sing.
|
||||
checkOneTerm(a, "lietos", "liet"); // loc. pl.
|
||||
checkOneTerm(a, "lietus", "liet"); // voc. sing.
|
||||
checkOneTerm(a, "lieti", "liet"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns4() throws IOException {
|
||||
// decl IV
|
||||
checkOneTerm(a, "lapa", "lap"); // nom. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // nom. pl.
|
||||
checkOneTerm(a, "lapas", "lap"); // gen. sing.
|
||||
checkOneTerm(a, "lapu", "lap"); // gen. pl.
|
||||
checkOneTerm(a, "lapai", "lap"); // dat. sing.
|
||||
checkOneTerm(a, "lapām", "lap"); // dat. pl.
|
||||
checkOneTerm(a, "lapu", "lap"); // acc. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // acc. pl.
|
||||
checkOneTerm(a, "lapā", "lap"); // loc. sing.
|
||||
checkOneTerm(a, "lapās", "lap"); // loc. pl.
|
||||
checkOneTerm(a, "lapa", "lap"); // voc. sing.
|
||||
checkOneTerm(a, "lapas", "lap"); // voc. pl.
|
||||
|
||||
checkOneTerm(a, "puika", "puik"); // nom. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // nom. pl.
|
||||
checkOneTerm(a, "puikas", "puik"); // gen. sing.
|
||||
checkOneTerm(a, "puiku", "puik"); // gen. pl.
|
||||
checkOneTerm(a, "puikam", "puik"); // dat. sing.
|
||||
checkOneTerm(a, "puikām", "puik"); // dat. pl.
|
||||
checkOneTerm(a, "puiku", "puik"); // acc. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // acc. pl.
|
||||
checkOneTerm(a, "puikā", "puik"); // loc. sing.
|
||||
checkOneTerm(a, "puikās", "puik"); // loc. pl.
|
||||
checkOneTerm(a, "puika", "puik"); // voc. sing.
|
||||
checkOneTerm(a, "puikas", "puik"); // voc. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Genitive plural forms with (s,t) -> š and (d,z) -> ž
|
||||
* will not conflate due to ambiguity.
|
||||
*/
|
||||
public void testNouns5() throws IOException {
|
||||
// decl V
|
||||
// l -> ļ palatalization
|
||||
checkOneTerm(a, "egle", "egl"); // nom. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // nom. pl.
|
||||
checkOneTerm(a, "egles", "egl"); // gen. sing.
|
||||
checkOneTerm(a, "egļu", "egl"); // gen. pl.
|
||||
checkOneTerm(a, "eglei", "egl"); // dat. sing.
|
||||
checkOneTerm(a, "eglēm", "egl"); // dat. pl.
|
||||
checkOneTerm(a, "egli", "egl"); // acc. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // acc. pl.
|
||||
checkOneTerm(a, "eglē", "egl"); // loc. sing.
|
||||
checkOneTerm(a, "eglēs", "egl"); // loc. pl.
|
||||
checkOneTerm(a, "egle", "egl"); // voc. sing.
|
||||
checkOneTerm(a, "egles", "egl"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testNouns6() throws IOException {
|
||||
// decl VI
|
||||
|
||||
// no palatalization
|
||||
checkOneTerm(a, "govs", "gov"); // nom. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // nom. pl.
|
||||
checkOneTerm(a, "govs", "gov"); // gen. sing.
|
||||
checkOneTerm(a, "govju", "gov"); // gen. pl.
|
||||
checkOneTerm(a, "govij", "gov"); // dat. sing.
|
||||
checkOneTerm(a, "govīm", "gov"); // dat. pl.
|
||||
checkOneTerm(a, "govi ", "gov"); // acc. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // acc. pl.
|
||||
checkOneTerm(a, "govi ", "gov"); // inst. sing.
|
||||
checkOneTerm(a, "govīm", "gov"); // inst. pl.
|
||||
checkOneTerm(a, "govī", "gov"); // loc. sing.
|
||||
checkOneTerm(a, "govīs", "gov"); // loc. pl.
|
||||
checkOneTerm(a, "govs", "gov"); // voc. sing.
|
||||
checkOneTerm(a, "govis", "gov"); // voc. pl.
|
||||
}
|
||||
|
||||
public void testAdjectives() throws IOException {
|
||||
checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing.
|
||||
checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing.
|
||||
checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl.
|
||||
checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl.
|
||||
checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing.
|
||||
checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing.
|
||||
checkOneTerm(a, "zilas", "zil"); // indef. nom. fem. pl.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl.
|
||||
checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing.
|
||||
checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl.
|
||||
checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. gen. fem. pl.
|
||||
checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing.
|
||||
checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing.
|
||||
checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl.
|
||||
checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
|
||||
checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing.
|
||||
checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing.
|
||||
checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl.
|
||||
checkOneTerm(a, "zilajām", "zil"); // def. dat. fem. pl.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing.
|
||||
checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl.
|
||||
checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl.
|
||||
checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing.
|
||||
checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // indef. acc. fem. pl.
|
||||
checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing.
|
||||
checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing.
|
||||
checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl.
|
||||
checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing.
|
||||
checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl.
|
||||
checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl.
|
||||
checkOneTerm(a, "zilais", "zil"); // voc. masc. sing.
|
||||
checkOneTerm(a, "zilie", "zil"); // voc. masc. pl.
|
||||
checkOneTerm(a, "zilā", "zil"); // voc. fem. sing.
|
||||
checkOneTerm(a, "zilās", "zil"); // voc. fem. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: we intentionally don't handle the ambiguous
|
||||
* (s,t) -> š and (d,z) -> ž
|
||||
*/
|
||||
public void testPalatalization() throws IOException {
|
||||
checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
|
||||
checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
|
||||
checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
|
||||
checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
|
||||
checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
|
||||
checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl.
|
||||
checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
|
||||
checkOneTerm(a, "zižļu", "zizl"); // gen. pl.
|
||||
checkOneTerm(a, "vilnis", "viln"); // nom. sing.
|
||||
checkOneTerm(a, "viļņu", "viln"); // gen. pl.
|
||||
checkOneTerm(a, "lelle", "lell"); // nom. sing.
|
||||
checkOneTerm(a, "leļļu", "lell"); // gen. pl.
|
||||
checkOneTerm(a, "pinne", "pinn"); // nom. sing.
|
||||
checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
|
||||
checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
|
||||
checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl.
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some length restrictions, we require a 3+ char stem,
|
||||
* with at least one vowel.
|
||||
*/
|
||||
public void testLength() throws IOException {
|
||||
checkOneTerm(a, "usa", "usa"); // length
|
||||
checkOneTerm(a, "60ms", "60ms"); // vowel count
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.lv.LatvianStemFilter;
|
||||
|
||||
/**
|
||||
* Factory for {@link LatvianStemFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.LatvianStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new LatvianStemFilter(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Latvian stem factory is working.
|
||||
*/
|
||||
public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("tirgiem tirgus");
|
||||
LatvianStemFilterFactory factory = new LatvianStemFilterFactory();
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream, new String[] { "tirg", "tirg" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue