LUCENE-2437: Indonesian Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942235 13f79535-47bb-0310-9956-ffa450edef68
2010-05-07 21:21:12 +00:00 · 2010-05-07 21:21:12 +00:00 · 1b020be130
parent 37a081173a
commit 1b020be130
10 changed files with 1169 additions and 0 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -152,6 +152,8 @@ New features
   of AttributeSource.cloneAttributes() instances and the new copyTo() method.
   (Steven Rowe via Uwe Schindler)
 * LUCENE-2437: Add an Analyzer for Indonesian.  (Robert Muir)
 Build
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
@ -0,0 +1,130 @@
 package org.apache.lucene.analysis.id;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Set;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordMarkerFilter;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
 /**
 * Analyzer for Indonesian (Bahasa)
 */
 public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
  /** File containing default Indonesian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static Set<?> getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }
  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
   * accesses the static final set the first time.;
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;
    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }
  private final Set<?> stemExclusionSet;
  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public IndonesianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }
  /**
   * Builds an analyzer with the given stop words
   * 
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   */
  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }
  /**
   * Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * {@link IndonesianStemFilter}.
   * 
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   * @param stemExclusionSet
   *          a set of terms not to be stemmed
   */
  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }
  /**
   * Creates
   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   * 
   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerFilter}
   *         if a stem exclusion set is provided and {@link IndonesianStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, source);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    return new TokenStreamComponents(source, new IndonesianStemFilter(result));
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java
@ -0,0 +1,67 @@
 package org.apache.lucene.analysis.id;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 /**
 * A {@link TokenFilter} that applies {@link IndonesianStemmer} to stem Indonesian words.
 */
 public final class IndonesianStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final IndonesianStemmer stemmer = new IndonesianStemmer();
  private final boolean stemDerivational;
  /**
   * Calls {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)}
   */
  public IndonesianStemFilter(TokenStream input) {
    this(input, true);
  }
  /**
   * Create a new IndonesianStemFilter.
   * <p>
   * If <code>stemDerivational</code> is false, 
   * only inflectional suffixes (particles and possessive pronouns) are stemmed.
   */
  public IndonesianStemFilter(TokenStream input, boolean stemDerivational) {
    super(input);
    this.stemDerivational = stemDerivational;
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if(!keywordAtt.isKeyword()) {
        final int newlen = 
          stemmer.stem(termAtt.buffer(), termAtt.length(), stemDerivational);
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
@ -0,0 +1,304 @@
 package org.apache.lucene.analysis.id;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Stemmer for Indonesian.
 * <p>
 * Stems Indonesian words with the algorithm presented in:
 * <i>A Study of Stemming Effects on Information Retrieval in 
 * Bahasa Indonesia</i>, Fadillah Z Tala.
 * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
 */
 public class IndonesianStemmer {
  private int numSyllables;
  private int flags;
  private static final int REMOVED_KE = 1;
  private static final int REMOVED_PENG = 2;
  private static final int REMOVED_DI = 4;
  private static final int REMOVED_MENG = 8;
  private static final int REMOVED_TER = 16;
  private static final int REMOVED_BER = 32;
  private static final int REMOVED_PE = 64;
  /**
   * Stem a term (returning its new length).
   * <p>
   * Use <code>stemDerivational</code> to control whether full stemming
   * or only light inflectional stemming is done.
   */
  public int stem(char text[], int length, boolean stemDerivational) {
    flags = 0;
    numSyllables = 0;
    for (int i = 0; i < length; i++)
      if (isVowel(text[i]))
          numSyllables++;
    if (numSyllables > 2) length = removeParticle(text, length);
    if (numSyllables > 2) length = removePossessivePronoun(text, length);
    if (stemDerivational)
      length = stemDerivational(text, length);
    return length;
  }
  private int stemDerivational(char text[], int length) {
    int oldLength = length;
    if (numSyllables > 2) length = removeFirstOrderPrefix(text, length);
    if (oldLength != length) { // a rule is fired
      oldLength = length;
      if (numSyllables > 2) length = removeSuffix(text, length);
      if (oldLength != length) // a rule is fired
        if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
    } else { // fail
      if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
      if (numSyllables > 2) length = removeSuffix(text, length);
    }
    return length;
  }
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        return true;
      default:
        return false;
    }
  }
  private int removeParticle(char text[], int length) {
    if (endsWith(text, length, "kah") || 
        endsWith(text, length, "lah") || 
        endsWith(text, length, "pun")) {
        numSyllables--;
        return length - 3;
    }
    return length;
  }
  private int removePossessivePronoun(char text[], int length) {
    if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
      numSyllables--;
      return length - 2;
    }
    if (endsWith(text, length, "nya")) {
      numSyllables--;
      return length - 3;
    }
    return length;
  }
  private int removeFirstOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "meng")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }
    if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
      flags |= REMOVED_MENG;
      text[3] = 's';
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "men")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "mem")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "me")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    if (startsWith(text, length, "peng")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }
    if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
      flags |= REMOVED_PENG;
      text[3] = 's';
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "peny")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }
    if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
      flags |= REMOVED_PENG;
      text[2] = 't';
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    if (startsWith(text, length, "pen")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "pem")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "di")) {
      flags |= REMOVED_DI;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    if (startsWith(text, length, "ter")) {
      flags |= REMOVED_TER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "ke")) {
      flags |= REMOVED_KE;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    return length;
  }
  private int removeSecondOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "ber")) {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (length == 7 && startsWith(text, length, "belajar")) {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "be") && length > 4 
        && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    if (startsWith(text, length, "per")) {
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (length == 7 && startsWith(text, length, "pelajar")) {
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }
    if (startsWith(text, length, "pe")) {
      flags |= REMOVED_PE;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }
    return length;
  }
  private int removeSuffix(char text[], int length) {
    if (endsWith(text, length, "kan") 
        && (flags & REMOVED_KE) == 0 
        && (flags & REMOVED_PENG) == 0 
        && (flags & REMOVED_PE) == 0) {
      numSyllables--;
      return length - 3;
    }
    if (endsWith(text, length, "an") 
        && (flags & REMOVED_DI) == 0 
        && (flags & REMOVED_MENG) == 0 
        && (flags & REMOVED_TER) == 0) {
      numSyllables--;
      return length - 2;
    }
    if (endsWith(text, length, "i") 
        && !endsWith(text, length, "si") 
        && (flags & REMOVED_BER) == 0 
        && (flags & REMOVED_KE) == 0 
        && (flags & REMOVED_PENG) == 0) {
      numSyllables--;
      return length - 1;
    }
    return length;
  }
  private boolean startsWith(char s[], int len, String prefix) {
    final int prefixLen = prefix.length();
    if (prefixLen > len)
      return false;
    for (int i = 0; i < prefixLen; i++)
      if (s[i] != prefix.charAt(i)) 
        return false;
    return true;
  }
  private boolean endsWith(char s[], int len, String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len)
      return false;
    for (int i = suffixLen - 1; i >= 0; i--)
      if (s[len -(suffixLen - i)] != suffix.charAt(i))
        return false;
    return true;
  }
  private int deleteN(char s[], int pos, int len, int nChars) {
    for (int i = 0; i < nChars; i++)
      len = delete(s, pos, len);
    return len;
  }
  private int delete(char s[], int pos, int len) {
    if (pos < len) 
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
    return len - 1;
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html
@ -0,0 +1,22 @@
 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <html><head></head>
 <body>
 Analyzer for Indonesian.
 </body>
 </html>
--- a/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt
+++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt
@ -0,0 +1,359 @@
 # from appendix D of: A Study of Stemming Effects on Information
 # Retrieval in Bahasa Indonesia
 ada
 adanya
 adalah
 adapun
 agak
 agaknya
 agar
 akan
 akankah
 akhirnya
 aku
 akulah
 amat
 amatlah
 anda
 andalah
 antar
 diantaranya
 antara
 antaranya
 diantara
 apa
 apaan
 mengapa
 apabila
 apakah
 apalagi
 apatah
 atau
 ataukah
 ataupun
 bagai
 bagaikan
 sebagai
 sebagainya
 bagaimana
 bagaimanapun
 sebagaimana
 bagaimanakah
 bagi
 bahkan
 bahwa
 bahwasanya
 sebaliknya
 banyak
 sebanyak
 beberapa
 seberapa
 begini
 beginian
 beginikah
 beginilah
 sebegini
 begitu
 begitukah
 begitulah
 begitupun
 sebegitu
 belum
 belumlah
 sebelum
 sebelumnya
 sebenarnya
 berapa
 berapakah
 berapalah
 berapapun
 betulkah
 sebetulnya
 biasa
 biasanya
 bila
 bilakah
 bisa
 bisakah
 sebisanya
 boleh
 bolehkah
 bolehlah
 buat
 bukan
 bukankah
 bukanlah
 bukannya
 cuma
 percuma
 dahulu
 dalam
 dan
 dapat
 dari
 daripada
 dekat
 demi
 demikian
 demikianlah
 sedemikian
 dengan
 depan
 di
 dia
 dialah
 dini
 diri
 dirinya
 terdiri
 dong
 dulu
 enggak
 enggaknya
 entah
 entahlah
 terhadap
 terhadapnya
 hal
 hampir
 hanya
 hanyalah
 harus
 haruslah
 harusnya
 seharusnya
 hendak
 hendaklah
 hendaknya
 hingga
 sehingga
 ia
 ialah
 ibarat
 ingin
 inginkah
 inginkan
 ini
 inikah
 inilah
 itu
 itukah
 itulah
 jangan
 jangankan
 janganlah
 jika
 jikalau
 juga
 justru
 kala
 kalau
 kalaulah
 kalaupun
 kalian
 kami
 kamilah
 kamu
 kamulah
 kan
 kapan
 kapankah
 kapanpun
 dikarenakan
 karena
 karenanya
 ke
 kecil
 kemudian
 kenapa
 kepada
 kepadanya
 ketika
 seketika
 khususnya
 kini
 kinilah
 kiranya
 sekiranya
 kita
 kitalah
 kok
 lagi
 lagian
 selagi
 lah
 lain
 lainnya
 melainkan
 selaku
 lalu
 melalui
 terlalu
 lama
 lamanya
 selama
 selama
 selamanya
 lebih
 terlebih
 bermacam
 macam
 semacam
 maka
 makanya
 makin
 malah
 malahan
 mampu
 mampukah
 mana
 manakala
 manalagi
 masih
 masihkah
 semasih
 masing
 mau
 maupun
 semaunya
 memang
 mereka
 merekalah
 meski
 meskipun
 semula
 mungkin
 mungkinkah
 nah
 namun
 nanti
 nantinya
 nyaris
 oleh
 olehnya
 seorang
 seseorang
 pada
 padanya
 padahal
 paling
 sepanjang
 pantas
 sepantasnya
 sepantasnyalah
 para
 pasti
 pastilah
 per
 pernah
 pula
 pun
 merupakan
 rupanya
 serupa
 saat
 saatnya
 sesaat
 saja
 sajalah
 saling
 bersama
 sama
 sesama
 sambil
 sampai
 sana
 sangat
 sangatlah
 saya
 sayalah
 se
 sebab
 sebabnya
 sebuah
 tersebut
 tersebutlah
 sedang
 sedangkan
 sedikit
 sedikitnya
 segala
 segalanya
 segera
 sesegera
 sejak
 sejenak
 sekali
 sekalian
 sekalipun
 sesekali
 sekaligus
 sekarang
 sekarang
 sekitar
 sekitarnya
 sela
 selain
 selalu
 seluruh
 seluruhnya
 semakin
 sementara
 sempat
 semua
 semuanya
 sendiri
 sendirinya
 seolah
 seperti
 sepertinya
 sering
 seringnya
 serta
 siapa
 siapakah
 siapapun
 disini
 disinilah
 sini
 sinilah
 sesuatu
 sesuatunya
 suatu
 sesudah
 sesudahnya
 sudah
 sudahkah
 sudahlah
 supaya
 tadi
 tadinya
 tak
 tanpa
 setelah
 telah
 tentang
 tentu
 tentulah
 tentunya
 tertentu
 seterusnya
 tapi
 tetapi
 setiap
 tiap
 setidaknya
 tidak
 tidakkah
 tidaklah
 toh
 waduh
 wah
 wahai
 sewaktu
 walau
 walaupun
 wong
 yaitu
 yakni
 yang
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
@ -0,0 +1,53 @@
 package org.apache.lucene.analysis.id;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the 
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new IndonesianAnalyzer(TEST_VERSION_CURRENT);
  }
  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
    // stemming
    checkOneTermReuse(a, "peledakan", "ledak");
    checkOneTermReuse(a, "pembunuhan", "bunuh");
    // stopword
    assertAnalyzesTo(a, "bahwa", new String[] {});
  }
  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("peledakan");
    Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT, 
        IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "peledakan", "peledakan");
    checkOneTermReuse(a, "pembunuhan", "bunuh");
  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
@ -0,0 +1,136 @@
 package org.apache.lucene.analysis.id;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
 /**
 * Tests {@link IndonesianStemmer}
 */
 public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
  /* full stemming, no stopwords */
  Analyzer a = new ReusableAnalyzerBase() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
    }
  };
  /** Some examples from the paper */
  public void testExamples() throws IOException {
    checkOneTerm(a, "bukukah", "buku");
    checkOneTermReuse(a, "adalah", "ada");
    checkOneTermReuse(a, "bukupun", "buku");
    checkOneTermReuse(a, "bukuku", "buku");
    checkOneTermReuse(a, "bukumu", "buku");
    checkOneTermReuse(a, "bukunya", "buku");
    checkOneTermReuse(a, "mengukur", "ukur");
    checkOneTermReuse(a, "menyapu", "sapu");
    checkOneTermReuse(a, "menduga", "duga");
    checkOneTermReuse(a, "menuduh", "uduh");
    checkOneTermReuse(a, "membaca", "baca");
    checkOneTermReuse(a, "merusak", "rusak");
    checkOneTermReuse(a, "pengukur", "ukur");
    checkOneTermReuse(a, "penyapu", "sapu");
    checkOneTermReuse(a, "penduga", "duga");
    checkOneTermReuse(a, "pembaca", "baca");
    checkOneTermReuse(a, "diukur", "ukur");
    checkOneTermReuse(a, "tersapu", "sapu");
    checkOneTermReuse(a, "kekasih", "kasih");
    checkOneTermReuse(a, "berlari", "lari");
    checkOneTermReuse(a, "belajar", "ajar");
    checkOneTermReuse(a, "bekerja", "kerja");
    checkOneTermReuse(a, "perjelas", "jelas");
    checkOneTermReuse(a, "pelajar", "ajar");
    checkOneTermReuse(a, "pekerja", "kerja");
    checkOneTermReuse(a, "tarikkan", "tarik");
    checkOneTermReuse(a, "ambilkan", "ambil");
    checkOneTermReuse(a, "mengambilkan", "ambil");
    checkOneTermReuse(a, "makanan", "makan");
    checkOneTermReuse(a, "janjian", "janji");
    checkOneTermReuse(a, "perjanjian", "janji");
    checkOneTermReuse(a, "tandai", "tanda");
    checkOneTermReuse(a, "dapati", "dapat");
    checkOneTermReuse(a, "mendapati", "dapat");
    checkOneTermReuse(a, "pantai", "panta");
  }
  /** Some detailed analysis examples (that might not be the best) */
  public void testIRExamples() throws IOException {
    checkOneTerm(a, "penyalahgunaan", "salahguna");
    checkOneTermReuse(a, "menyalahgunakan", "salahguna");
    checkOneTermReuse(a, "disalahgunakan", "salahguna");
    checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
    checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
    checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");
    checkOneTermReuse(a, "pelaksanaan", "laksana");
    checkOneTermReuse(a, "pelaksana", "laksana");
    checkOneTermReuse(a, "melaksanakan", "laksana");
    checkOneTermReuse(a, "dilaksanakan", "laksana");
    checkOneTermReuse(a, "melibatkan", "libat");
    checkOneTermReuse(a, "terlibat", "libat");
    checkOneTermReuse(a, "penculikan", "culik");
    checkOneTermReuse(a, "menculik", "culik");
    checkOneTermReuse(a, "diculik", "culik");
    checkOneTermReuse(a, "penculik", "culik");
    checkOneTermReuse(a, "perubahan", "ubah");
    checkOneTermReuse(a, "peledakan", "ledak");
    checkOneTermReuse(a, "penanganan", "tangan");
    checkOneTermReuse(a, "kepolisian", "polisi");
    checkOneTermReuse(a, "kenaikan", "naik");
    checkOneTermReuse(a, "bersenjata", "senjata");
    checkOneTermReuse(a, "penyelewengan", "seleweng");
    checkOneTermReuse(a, "kecelakaan", "celaka");
  }
  /* inflectional-only stemming */
  Analyzer b = new ReusableAnalyzerBase() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
    }
  };
  /** Test stemming only inflectional suffixes */
  public void testInflectionalOnly() throws IOException {
    checkOneTerm(b, "bukunya", "buku");
    checkOneTermReuse(b, "bukukah", "buku");
    checkOneTermReuse(b, "bukunyakah", "buku");
    checkOneTermReuse(b, "dibukukannya", "dibukukan");
  }
  public void testShouldntStem() throws IOException {
    checkOneTerm(a, "bersenjata", "senjata");
    checkOneTermReuse(a, "bukukah", "buku");
    checkOneTermReuse(a, "gigi", "gigi");
  }
 }
--- a/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java
@ -0,0 +1,37 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.id.IndonesianStemFilter;
 /** Factory for {@link IndonesianStemFilter} */
 public class IndonesianStemFilterFactory extends BaseTokenFilterFactory {
  private boolean stemDerivational = true;
  public void init(Map<String, String> args) {
    super.init(args);
    stemDerivational = getBoolean("stemDerivational", true);
  }
  public TokenStream create(TokenStream input) {
    return new IndonesianStemFilter(input, stemDerivational);
  }
 }
--- a/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java
@ -0,0 +1,59 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
 * Simple tests to ensure the Indonesian stem filter factory is working.
 */
 public class TestIndonesianStemFilterFactory extends BaseTokenTestCase {
  /**
   * Ensure the filter actually stems text.
   */
  public void testStemming() throws Exception {
    Reader reader = new StringReader("dibukukannya");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    factory.init(args);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "buku" });
  }
  /**
   * Test inflectional-only mode
   */
  public void testStemmingInflectional() throws Exception {
    Reader reader = new StringReader("dibukukannya");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("stemDerivational", "false");
    factory.init(args);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "dibukukan" });
  }
 }