LUCENE-3016: add analyzer for Latvian

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092396 13f79535-47bb-0310-9956-ffa450edef68
2011-04-14 17:07:10 +00:00 · 2011-04-14 17:07:10 +00:00 · c3f6331639
parent 9b8cfb80b5
commit c3f6331639
10 changed files with 958 additions and 0 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -50,6 +50,10 @@ Bug fixes
 * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
   on sentences longer than 32,767 characters.  (wangzhenghang via Robert Muir)
 New Features
 * LUCENE-3016: Add analyzer for Latvian.  (Robert Muir)
 ======================= Lucene 3.1.0 =======================
 Changes in backwards compatibility policy
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
@ -0,0 +1,129 @@
 package org.apache.lucene.analysis.lv;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.Version;
 /**
 * {@link Analyzer} for Latvian.
 */
 public final class LatvianAnalyzer extends StopwordAnalyzerBase {
  /** Terms that are marked as keywords so the stemmer leaves them untouched. */
  private final Set<?> stemExclusionSet;
  /** File containing default Latvian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /**
   * Returns the default stop words set, i.e. the set used by
   * {@link #LatvianAnalyzer(Version)}.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }
  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;
    static {
      try {
        // NOTE(review): no comment prefix is passed to the loader here, while the
        // bundled stopword resource contains '#' header lines -- confirm whether
        // this overload skips them or adds them to the set as literal entries.
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class, 
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR); chain the IOException so that a missing or
        // unreadable resource can still be diagnosed from the stack trace
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }
  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   * 
   * @param matchVersion lucene compatibility version
   */
  public LatvianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }
  /**
   * Builds an analyzer with the given stop words and an empty stem
   * exclusion set.
   * 
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }
  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * stemming.
   * 
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed; it is copied, so
   *        later changes to the passed set do not affect this analyzer
   */
  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }
  /**
   * Creates a
   * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   * 
   * @param fieldName name of the field being analyzed (not used by this chain)
   * @param reader source of the text to analyze
   * @return A
   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link LatvianStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    // the keyword marker is only inserted when there is something to protect
    if (!stemExclusionSet.isEmpty()) {
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new LatvianStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
@ -0,0 +1,58 @@
 package org.apache.lucene.analysis.lv;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 /**
 * A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
 public final class LatvianStemFilter extends TokenFilter {
  private final LatvianStemmer stemmer = new LatvianStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  /**
   * Creates a filter that stems the terms produced by {@code input}.
   * 
   * @param input the stream whose terms are stemmed
   */
  public LatvianStemFilter(TokenStream input) {
    super(input);
  }
  /**
   * Advances the wrapped stream and, unless the token is flagged as a
   * keyword, shortens the term to the length reported by the stemmer.
   * 
   * @return {@code false} once the wrapped stream is exhausted, otherwise
   *         {@code true}
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens pass through unchanged
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int stemmedLength = stemmer.stem(termBuffer, termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java
@ -0,0 +1,174 @@
 package org.apache.lucene.analysis.lv;
 import static org.apache.lucene.analysis.util.StemmerUtil.*;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Light stemmer for Latvian.
 * <p>
 * This is a light version of the algorithm in Karlis Kreslin's PhD thesis
 * <i>A stemming algorithm for Latvian</i> with the following modifications:
 * <ul>
 *   <li>Only explicitly stems noun and adjective morphology
 *   <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
 *   <li>Removes only the primary inflectional suffixes: case and number for nouns ; 
 *       case, number, gender, and definitiveness for adjectives.
 *   <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
 * </ul>
 */
 public class LatvianStemmer {
  /**
   * Stems a Latvian word in place by removing the first suffix of the affix
   * table that matches, optionally undoing palatalization of the stem.
   * 
   * @param s buffer holding the word; may be rewritten near the end of the stem
   * @param len number of valid chars in {@code s}
   * @return the new adjusted length
   */
  public int stem(char s[], int len) {
    final int vowelCount = numVowels(s, len);
    for (Affix candidate : affixes) {
      final int suffixLength = candidate.affix.length;
      // the word needs more vowels than the suffix itself, and at least
      // three chars must remain once the suffix is gone
      if (vowelCount <= candidate.vc || len < suffixLength + 3) {
        continue;
      }
      if (!endsWith(s, len, candidate.affix)) {
        continue;
      }
      final int stemLength = len - suffixLength;
      if (candidate.palatalizes) {
        return unpalatalize(s, stemLength);
      }
      return stemLength;
    }
    return len;
  }
  /** Suffix table; entries are tried in this order and the first match wins. */
  static final Affix affixes[] = {
    new Affix("ajiem", 3, false), new Affix("ajai",  3, false), 
    new Affix("ajam",  2, false), new Affix("ajām",  2, false),
    new Affix("ajos",  2, false), new Affix("ajās",  2, false),
    new Affix("iem",   2, true),  new Affix("ajā",   2, false),
    new Affix("ais",   2, false), new Affix("ai",    2, false),
    new Affix("ei",    2, false), new Affix("ām",    1, false),
    new Affix("am",    1, false), new Affix("ēm",    1, false),
    new Affix("īm",    1, false), new Affix("im",    1, false),
    new Affix("um",    1, false), new Affix("us",    1, true),
    new Affix("as",    1, false), new Affix("ās",    1, false),
    new Affix("es",    1, false), new Affix("os",    1, true),
    new Affix("ij",    1, false), new Affix("īs",    1, false),
    new Affix("ēs",    1, false), new Affix("is",    1, false),
    new Affix("ie",    1, false), new Affix("u",     1, true),
    new Affix("a",     1, true),  new Affix("i",     1, true),
    new Affix("e",     1, false), new Affix("ā",     1, false),
    new Affix("ē",     1, false), new Affix("ī",     1, false),
    new Affix("ū",     1, false), new Affix("o",     1, false),
    new Affix("s",     0, false), new Affix("š",     0, false),
  };
  /** One removable suffix together with the checks tied to it. */
  static class Affix {
    char affix[];         // suffix
    int vc;               // vowel count of the suffix
    boolean palatalizes;  // true if we should fire palatalization rules.
    Affix(String affix, int vc, boolean palatalizes) {
      this.affix = affix.toCharArray();
      this.vc = vc;
      this.palatalizes = palatalizes;
    }
  }
  /**
   * Two-char stem endings and their unpalatalized replacements, tried in
   * this order.
   */
  private static final String[][] CLUSTER_RULES = {
    { "šņ", "sn" }, { "žņ", "zn" },
    { "šļ", "sl" }, { "žļ", "zl" },
    { "ļņ", "ln" }, { "ļļ", "ll" },
  };
  /** Characters counted as vowels by {@link #numVowels(char[], int)}. */
  private static final String VOWELS = "aeiouāīēū";
  /**
   * Reverts palatalization at the end of the stem and returns the resulting
   * length. Most cases are handled except for the ambiguous ones:
   * <ul>
   *  <li> s to š
   *  <li> t to š
   *  <li> d to ž
   *  <li> z to ž
   * </ul>
   */
  private int unpalatalize(char s[], int len) {
    // s[len] is the first char of the suffix that was just removed: if it is
    // -u then it is 2,5, or 6 gen pl., and only then do these two rules apply.
    final boolean removedU = s[len] == 'u';
    if (removedU && endsWith(s, len, "kš")) {
      // kš becomes kst: the extra char reuses the slot of the removed 'u'
      s[len - 1] = 's';
      s[len] = 't';
      return len + 1;
    }
    if (removedU && endsWith(s, len, "ņņ")) {
      s[len - 2] = 'n';
      s[len - 1] = 'n';
      return len;
    }
    // labial consonant followed by j: drop the j
    final boolean labialJ = endsWith(s, len, "pj") || endsWith(s, len, "bj")
        || endsWith(s, len, "mj") || endsWith(s, len, "vj");
    if (labialJ) {
      return len - 1;
    }
    // two-char clusters take precedence over the single-consonant rules
    for (String[] rule : CLUSTER_RULES) {
      if (endsWith(s, len, rule[0])) {
        s[len - 2] = rule[1].charAt(0);
        s[len - 1] = rule[1].charAt(1);
        return len;
      }
    }
    switch (s[len - 1]) {
      case 'č':
        s[len - 1] = 'c';
        break;
      case 'ļ':
        s[len - 1] = 'l';
        break;
      case 'ņ':
        s[len - 1] = 'n';
        break;
      default:
        break;
    }
    return len;
  }
  /**
   * Counts the vowels in the first {@code len} chars of {@code s}; a stem is
   * only accepted when at least one vowel remains in it.
   */
  private int numVowels(char s[], int len) {
    int count = 0;
    for (int pos = 0; pos < len; pos++) {
      if (VOWELS.indexOf(s[pos]) >= 0) {
        count++;
      }
    }
    return count;
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html
@ -0,0 +1,22 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <html><head></head>
 <body>
 Analyzer for Latvian.
 </body>
 </html>
--- a/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt
+++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt
@ -0,0 +1,172 @@
 # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
 # the original list of over 800 forms was refined: 
 #   pronouns, adverbs, interjections were removed
 # 
 # prepositions
 aiz
 ap
 ar
 apakš
 ārpus
 augšpus
 bez
 caur
 dēļ
 gar
 iekš
 iz
 kopš
 labad
 lejpus
 līdz
 no
 otrpus
 pa
 par
 pār
 pēc
 pie
 pirms
 pret
 priekš
 starp
 šaipus
 uz
 viņpus
 virs
 virspus
 zem
 apakšpus
 # Conjunctions
 un
 bet
 jo
 ja
 ka
 lai
 tomēr
 tikko
 turpretī
 arī
 kaut
 gan
 tādēļ
 tā
 ne
 tikvien
 vien
 kā
 ir
 te
 vai
 kamēr
 # Particles
 ar
 diezin
 droši
 diemžēl
 nebūt
 ik
 it
 taču
 nu
 pat
 tiklab
 iekšpus
 nedz
 tik
 nevis
 turpretim
 jeb
 iekam
 iekām
 iekāms
 kolīdz
 līdzko
 tiklīdz
 jebšu
 tālab
 tāpēc
 nekā
 itin
 jā
 jau
 jel
 nē
 nezin
 tad
 tikai
 vis
 tak
 iekams
 vien
 # modal verbs
 būt  
 biju 
 biji
 bija
 bijām
 bijāt
 esmu
 esi
 esam
 esat 
 būšu     
 būsi
 būs
 būsim
 būsiet
 tikt
 tiku
 tiki
 tika
 tikām
 tikāt
 tieku
 tiec
 tiek
 tiekam
 tiekat
 tikšu
 tiks
 tiksim
 tiksiet
 tapt
 tapi
 tapāt
 topat
 tapšu
 tapsi
 taps
 tapsim
 tapsiet
 kļūt
 kļuvu
 kļuvi
 kļuva
 kļuvām
 kļuvāt
 kļūstu
 kļūsti
 kļūst
 kļūstam
 kļūstat
 kļūšu
 kļūsi
 kļūs
 kļūsim
 kļūsiet
 # verbs
 varēt
 varēju
 varējām
 varēšu
 varēsim
 var
 varēji
 varējāt
 varēsi
 varēsiet
 varat
 varēja
 varēs
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
@ -0,0 +1,53 @@
 package org.apache.lucene.analysis.lv;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
  /**
   * Constructing the analyzer forces the default stopword set to load; the
   * test fails with NPE when the stopwords file is missing in classpath.
   */
  public void testResourcesAvailable() {
    new LatvianAnalyzer(TEST_VERSION_CURRENT);
  }
  /** Checks that the default analyzer both stems terms and removes stopwords. */
  public void testBasics() throws IOException {
    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT);
    // two inflected forms are reduced to the same stem
    checkOneTermReuse(analyzer, "tirgiem", "tirg");
    checkOneTermReuse(analyzer, "tirgus", "tirg");
    // a stopword yields no tokens at all
    assertAnalyzesTo(analyzer, "un", new String[0]);
  }
  /** Checks that a term in the exclusion set is emitted unstemmed. */
  public void testExclude() throws IOException {
    final Set<String> protectedTerms = new HashSet<String>();
    protectedTerms.add("tirgiem");
    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT, 
        LatvianAnalyzer.getDefaultStopSet(), protectedTerms);
    // the protected term is left alone, other forms are still stemmed
    checkOneTermReuse(analyzer, "tirgiem", "tirgiem");
    checkOneTermReuse(analyzer, "tirgus", "tirg");
  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
@ -0,0 +1,272 @@
 package org.apache.lucene.analysis.lv;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 /**
 * Basic tests for {@link LatvianStemmer}
 */
 public class TestLatvianStemmer extends BaseTokenStreamTestCase {
  /** Whitespace tokenizer followed only by the stem filter, so terms reach the stemmer as written. */
  private final Analyzer a = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
    }
  };
  /** All case forms of a decl. I noun conflate to one stem. */
  public void testNouns1() throws IOException {
    // decl. I
    checkOneTerm(a, "tēvs",   "tēv"); // nom. sing.
    checkOneTerm(a, "tēvi",   "tēv"); // nom. pl.
    checkOneTerm(a, "tēva",   "tēv"); // gen. sing.
    checkOneTerm(a, "tēvu",   "tēv"); // gen. pl.
    checkOneTerm(a, "tēvam",  "tēv"); // dat. sing.
    checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
    checkOneTerm(a, "tēvu",   "tēv"); // acc. sing.
    checkOneTerm(a, "tēvus",  "tēv"); // acc. pl.
    checkOneTerm(a, "tēvā",   "tēv"); // loc. sing.
    checkOneTerm(a, "tēvos",  "tēv"); // loc. pl.
    checkOneTerm(a, "tēvs",   "tēv"); // voc. sing.
    checkOneTerm(a, "tēvi",   "tēv"); // voc. pl.
  }
  /**
   * decl II nouns with (s,t) -> š and (d,z) -> ž
   * palatalization will generally conflate to two stems
   * due to the ambiguity (plural and singular).
   */
  public void testNouns2() throws IOException {
    // decl. II
    // c -> č palatalization
    checkOneTerm(a, "lācis",  "lāc"); // nom. sing.
    checkOneTerm(a, "lāči",   "lāc"); // nom. pl.
    checkOneTerm(a, "lāča",   "lāc"); // gen. sing.
    checkOneTerm(a, "lāču",   "lāc"); // gen. pl.
    checkOneTerm(a, "lācim",  "lāc"); // dat. sing.
    checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
    checkOneTerm(a, "lāci",   "lāc"); // acc. sing.
    checkOneTerm(a, "lāčus",  "lāc"); // acc. pl.
    checkOneTerm(a, "lācī",   "lāc"); // loc. sing.
    checkOneTerm(a, "lāčos",  "lāc"); // loc. pl.
    checkOneTerm(a, "lāci",   "lāc"); // voc. sing.
    checkOneTerm(a, "lāči",   "lāc"); // voc. pl.
    // n -> ņ palatalization
    checkOneTerm(a, "akmens",   "akmen"); // nom. sing.
    checkOneTerm(a, "akmeņi",   "akmen"); // nom. pl.
    checkOneTerm(a, "akmens",   "akmen"); // gen. sing.
    checkOneTerm(a, "akmeņu",   "akmen"); // gen. pl.
    checkOneTerm(a, "akmenim",  "akmen"); // dat. sing.
    checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
    checkOneTerm(a, "akmeni",   "akmen"); // acc. sing.
    checkOneTerm(a, "akmeņus",  "akmen"); // acc. pl.
    checkOneTerm(a, "akmenī",   "akmen"); // loc. sing.
    checkOneTerm(a, "akmeņos",  "akmen"); // loc. pl.
    checkOneTerm(a, "akmens",   "akmen"); // voc. sing.
    checkOneTerm(a, "akmeņi",   "akmen"); // voc. pl.
    // no palatalization
    checkOneTerm(a, "kurmis",   "kurm"); // nom. sing.
    checkOneTerm(a, "kurmji",   "kurm"); // nom. pl.
    checkOneTerm(a, "kurmja",   "kurm"); // gen. sing.
    checkOneTerm(a, "kurmju",   "kurm"); // gen. pl.
    checkOneTerm(a, "kurmim",   "kurm"); // dat. sing.
    checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
    checkOneTerm(a, "kurmi",    "kurm"); // acc. sing.
    checkOneTerm(a, "kurmjus",  "kurm"); // acc. pl.
    checkOneTerm(a, "kurmī",    "kurm"); // loc. sing.
    checkOneTerm(a, "kurmjos",  "kurm"); // loc. pl.
    checkOneTerm(a, "kurmi",    "kurm"); // voc. sing.
    checkOneTerm(a, "kurmji",   "kurm"); // voc. pl.
  }
  /** All case forms of a decl. III noun conflate to one stem. */
  public void testNouns3() throws IOException {
    // decl III
    checkOneTerm(a, "lietus",  "liet"); // nom. sing.
    checkOneTerm(a, "lieti",   "liet"); // nom. pl.
    checkOneTerm(a, "lietus",  "liet"); // gen. sing.
    checkOneTerm(a, "lietu",   "liet"); // gen. pl.
    checkOneTerm(a, "lietum",  "liet"); // dat. sing.
    checkOneTerm(a, "lietiem", "liet"); // dat. pl.
    checkOneTerm(a, "lietu",   "liet"); // acc. sing.
    checkOneTerm(a, "lietus",  "liet"); // acc. pl.
    checkOneTerm(a, "lietū",   "liet"); // loc. sing.
    checkOneTerm(a, "lietos",  "liet"); // loc. pl.
    checkOneTerm(a, "lietus",  "liet"); // voc. sing.
    checkOneTerm(a, "lieti",   "liet"); // voc. pl.
  }
  /** All case forms of two decl. IV nouns conflate to one stem each. */
  public void testNouns4() throws IOException {
    // decl IV
    checkOneTerm(a, "lapa",  "lap"); // nom. sing.
    checkOneTerm(a, "lapas", "lap"); // nom. pl.
    checkOneTerm(a, "lapas", "lap"); // gen. sing.
    checkOneTerm(a, "lapu",  "lap"); // gen. pl.
    checkOneTerm(a, "lapai", "lap"); // dat. sing.
    checkOneTerm(a, "lapām", "lap"); // dat. pl.
    checkOneTerm(a, "lapu",  "lap"); // acc. sing.
    checkOneTerm(a, "lapas", "lap"); // acc. pl.
    checkOneTerm(a, "lapā",  "lap"); // loc. sing.
    checkOneTerm(a, "lapās", "lap"); // loc. pl.
    checkOneTerm(a, "lapa",  "lap"); // voc. sing.
    checkOneTerm(a, "lapas", "lap"); // voc. pl.
    checkOneTerm(a, "puika",  "puik"); // nom. sing.
    checkOneTerm(a, "puikas", "puik"); // nom. pl.
    checkOneTerm(a, "puikas", "puik"); // gen. sing.
    checkOneTerm(a, "puiku",  "puik"); // gen. pl.
    checkOneTerm(a, "puikam", "puik"); // dat. sing.
    checkOneTerm(a, "puikām", "puik"); // dat. pl.
    checkOneTerm(a, "puiku",  "puik"); // acc. sing.
    checkOneTerm(a, "puikas", "puik"); // acc. pl.
    checkOneTerm(a, "puikā",  "puik"); // loc. sing.
    checkOneTerm(a, "puikās", "puik"); // loc. pl.
    checkOneTerm(a, "puika",  "puik"); // voc. sing.
    checkOneTerm(a, "puikas", "puik"); // voc. pl.
  }
  /**
   * Genitive plural forms with (s,t) -> š and (d,z) -> ž
   * will not conflate due to ambiguity.
   */
  public void testNouns5() throws IOException {
    // decl V
    // l -> ļ palatalization
    checkOneTerm(a, "egle",  "egl"); // nom. sing.
    checkOneTerm(a, "egles", "egl"); // nom. pl.
    checkOneTerm(a, "egles", "egl"); // gen. sing.
    checkOneTerm(a, "egļu",  "egl"); // gen. pl.
    checkOneTerm(a, "eglei", "egl"); // dat. sing.
    checkOneTerm(a, "eglēm", "egl"); // dat. pl.
    checkOneTerm(a, "egli",  "egl"); // acc. sing.
    checkOneTerm(a, "egles", "egl"); // acc. pl.
    checkOneTerm(a, "eglē",  "egl"); // loc. sing.
    checkOneTerm(a, "eglēs", "egl"); // loc. pl.
    checkOneTerm(a, "egle",  "egl"); // voc. sing.
    checkOneTerm(a, "egles", "egl"); // voc. pl.
  }
  /** All case forms of a decl. VI noun conflate to one stem. */
  public void testNouns6() throws IOException {
    // decl VI
    // no palatalization
    checkOneTerm(a, "govs",  "gov"); // nom. sing.
    checkOneTerm(a, "govis", "gov"); // nom. pl.
    checkOneTerm(a, "govs",  "gov"); // gen. sing.
    checkOneTerm(a, "govju", "gov"); // gen. pl.
    checkOneTerm(a, "govij", "gov"); // dat. sing.
    checkOneTerm(a, "govīm", "gov"); // dat. pl.
    // inputs are written without trailing whitespace so the stemmer, not the
    // tokenizer, is what turns them into the expected stem
    checkOneTerm(a, "govi",  "gov"); // acc. sing.
    checkOneTerm(a, "govis", "gov"); // acc. pl.
    checkOneTerm(a, "govi",  "gov"); // inst. sing.
    checkOneTerm(a, "govīm", "gov"); // inst. pl.
    checkOneTerm(a, "govī",  "gov"); // loc. sing.
    checkOneTerm(a, "govīs", "gov"); // loc. pl.
    checkOneTerm(a, "govs",  "gov"); // voc. sing.
    checkOneTerm(a, "govis", "gov"); // voc. pl.
  }
  /** Definite and indefinite adjective forms conflate to one stem. */
  public void testAdjectives() throws IOException {
    checkOneTerm(a, "zils",     "zil"); // indef. nom. masc. sing.
    checkOneTerm(a, "zilais",   "zil"); // def. nom. masc. sing.
    checkOneTerm(a, "zili",     "zil"); // indef. nom. masc. pl.
    checkOneTerm(a, "zilie",    "zil"); // def. nom. masc. pl.
    checkOneTerm(a, "zila",     "zil"); // indef. nom. fem. sing.
    checkOneTerm(a, "zilā",     "zil"); // def. nom. fem. sing.
    checkOneTerm(a, "zilas",    "zil"); // indef. nom. fem. pl.
    checkOneTerm(a, "zilās",    "zil"); // def. nom. fem. pl.
    checkOneTerm(a, "zila",     "zil"); // indef. gen. masc. sing.
    checkOneTerm(a, "zilā",     "zil"); // def. gen. masc. sing.
    checkOneTerm(a, "zilu",     "zil"); // indef. gen. masc. pl.
    checkOneTerm(a, "zilo",     "zil"); // def. gen. masc. pl.
    checkOneTerm(a, "zilas",    "zil"); // indef. gen. fem. sing.
    checkOneTerm(a, "zilās",    "zil"); // def. gen. fem. sing.
    checkOneTerm(a, "zilu",     "zil"); // indef. gen. fem. pl.
    checkOneTerm(a, "zilo",     "zil"); // def. gen. fem. pl.
    checkOneTerm(a, "zilam",    "zil"); // indef. dat. masc. sing.
    checkOneTerm(a, "zilajam",  "zil"); // def. dat. masc. sing.
    checkOneTerm(a, "ziliem",   "zil"); // indef. dat. masc. pl.
    checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
    checkOneTerm(a, "zilai",    "zil"); // indef. dat. fem. sing.
    checkOneTerm(a, "zilajai",  "zil"); // def. dat. fem. sing.
    checkOneTerm(a, "zilām",    "zil"); // indef. dat. fem. pl.
    checkOneTerm(a, "zilajām",  "zil"); // def. dat. fem. pl.
    checkOneTerm(a, "zilu",     "zil"); // indef. acc. masc. sing.
    checkOneTerm(a, "zilo",     "zil"); // def. acc. masc. sing.
    checkOneTerm(a, "zilus",    "zil"); // indef. acc. masc. pl.
    checkOneTerm(a, "zilos",    "zil"); // def. acc. masc. pl.
    checkOneTerm(a, "zilu",     "zil"); // indef. acc. fem. sing.
    checkOneTerm(a, "zilo",     "zil"); // def. acc. fem. sing.
    checkOneTerm(a, "zilās",    "zil"); // indef. acc. fem. pl.
    checkOneTerm(a, "zilās",    "zil"); // def. acc. fem. pl.
    checkOneTerm(a, "zilā",     "zil"); // indef. loc. masc. sing.
    checkOneTerm(a, "zilajā",   "zil"); // def. loc. masc. sing.
    checkOneTerm(a, "zilos",    "zil"); // indef. loc. masc. pl.
    checkOneTerm(a, "zilajos",  "zil"); // def. loc. masc. pl.
    checkOneTerm(a, "zilā",     "zil"); // indef. loc. fem. sing.
    checkOneTerm(a, "zilajā",   "zil"); // def. loc. fem. sing.
    checkOneTerm(a, "zilās",    "zil"); // indef. loc. fem. pl.
    checkOneTerm(a, "zilajās",  "zil"); // def. loc. fem. pl.
    checkOneTerm(a, "zilais",   "zil"); // voc. masc. sing.
    checkOneTerm(a, "zilie",    "zil"); // voc. masc. pl.
    checkOneTerm(a, "zilā",     "zil"); // voc. fem. sing.
    checkOneTerm(a, "zilās",    "zil"); // voc. fem. pl.
  }
  /**
   * Note: we intentionally don't handle the ambiguous
   * (s,t) -> š and (d,z) -> ž
   */
  public void testPalatalization() throws IOException {
    checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
    checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
    checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
    checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
    checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
    checkOneTerm(a, "kāpšļu",  "kāpsl"); // gen. pl.
    checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
    checkOneTerm(a, "zižļu",  "zizl"); // gen. pl.
    checkOneTerm(a, "vilnis", "viln"); // nom. sing.
    checkOneTerm(a, "viļņu",  "viln"); // gen. pl.
    checkOneTerm(a, "lelle", "lell"); // nom. sing.
    checkOneTerm(a, "leļļu", "lell"); // gen. pl.
    checkOneTerm(a, "pinne", "pinn"); // nom. sing.
    checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
    checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
    checkOneTerm(a, "rīkšu",  "rīkst"); // gen. pl.
  }
  /**
   * Test some length restrictions, we require a 3+ char stem,
   * with at least one vowel.
   */
  public void testLength() throws IOException {
    checkOneTerm(a, "usa", "usa"); // length
    checkOneTerm(a, "60ms", "60ms"); // vowel count
  }
 }
--- a/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java
@ -0,0 +1,38 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.lv.LatvianStemFilter;
 /** 
 * Factory for {@link LatvianStemFilter}. 
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
 public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
  /**
   * Wraps the given stream in a new {@link LatvianStemFilter}.
   * 
   * @param input the stream to stem
   * @return the stemming filter reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new LatvianStemFilter(input);
    return stemmed;
  }
 }
--- a/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java
@ -0,0 +1,36 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 /**
 * Simple tests to ensure the Latvian stem factory is working.
 */
 public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
  /** Two inflected forms fed through the factory's filter come out as the same stem. */
  public void testStemming() throws Exception {
    final String[] expectedStems = { "tirg", "tirg" };
    final Reader text = new StringReader("tirgiem tirgus");
    final LatvianStemFilterFactory stemFactory = new LatvianStemFilterFactory();
    final TokenStream stemmed = stemFactory.create(
        new WhitespaceTokenizer(DEFAULT_VERSION, text));
    assertTokenStreamContents(stemmed, expectedStems);
  }
 }