From 1b020be13029c5006acb55fb71872382f156c016 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 7 May 2010 21:21:12 +0000 Subject: [PATCH] LUCENE-2437: Indonesian Analyzer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942235 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 2 + .../analysis/id/IndonesianAnalyzer.java | 130 +++++++ .../analysis/id/IndonesianStemFilter.java | 67 ++++ .../lucene/analysis/id/IndonesianStemmer.java | 304 +++++++++++++++ .../apache/lucene/analysis/id/package.html | 22 ++ .../apache/lucene/analysis/id/stopwords.txt | 359 ++++++++++++++++++ .../analysis/id/TestIndonesianAnalyzer.java | 53 +++ .../analysis/id/TestIndonesianStemmer.java | 136 +++++++ .../analysis/IndonesianStemFilterFactory.java | 37 ++ .../TestIndonesianStemFilterFactory.java | 59 +++ 10 files changed, 1169 insertions(+) create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html create mode 100644 modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java create mode 100644 solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java create mode 100644 solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 0345833c1d2..89246f2ad22 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -152,6 +152,8 @@ New features of AttributeSource.cloneAttributes() instances and the new copyTo() method. (Steven Rowe via Uwe Schindler) + * LUCENE-2437: Add an Analyzer for Indonesian. (Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java new file mode 100644 index 00000000000..ed1271efd54 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java @@ -0,0 +1,130 @@ +package org.apache.lucene.analysis.id; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.KeywordMarkerFilter; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.StopwordAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; + +/** + * Analyzer for Indonesian (Bahasa) + */ +public final class IndonesianAnalyzer extends StopwordAnalyzerBase { + /** File containing default Indonesian stopwords. */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. + */ + public static Set getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /** + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + * accesses the static final set the first time.; + */ + private static class DefaultSetHolder { + static final Set DEFAULT_STOP_SET; + + static { + try { + DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); + } catch (IOException ex) { + // default set should always be present as it is part of the + // distribution (JAR) + throw new RuntimeException("Unable to load default stopword set"); + } + } + } + + private final Set stemExclusionSet; + + /** + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. + */ + public IndonesianAnalyzer(Version matchVersion) { + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); + } + + /** + * Builds an analyzer with the given stop words + * + * @param matchVersion + * lucene compatibility version + * @param stopwords + * a stopword set + */ + public IndonesianAnalyzer(Version matchVersion, Set stopwords){ + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); + } + + /** + * Builds an analyzer with the given stop word. If a none-empty stem exclusion set is + * provided this analyzer will add a {@link KeywordMarkerFilter} before + * {@link IndonesianStemFilter}. + * + * @param matchVersion + * lucene compatibility version + * @param stopwords + * a stopword set + * @param stemExclusionSet + * a set of terms not to be stemmed + */ + public IndonesianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet){ + super(matchVersion, stopwords); + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( + matchVersion, stemExclusionSet)); + } + + /** + * Creates + * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} + * used to tokenize all the text in the provided {@link Reader}. + * + * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} + * built from an {@link StandardTokenizer} filtered with + * {@link StandardFilter}, {@link LowerCaseFilter}, + * {@link StopFilter}, {@link KeywordMarkerFilter} + * if a stem exclusion set is provided and {@link IndonesianStemFilter}. + */ + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + TokenStream result = new StandardFilter(source); + result = new LowerCaseFilter(matchVersion, source); + result = new StopFilter(matchVersion, result, stopwords); + if (!stemExclusionSet.isEmpty()) { + result = new KeywordMarkerFilter(result, stemExclusionSet); + } + return new TokenStreamComponents(source, new IndonesianStemFilter(result)); + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java new file mode 100644 index 00000000000..517151fdf49 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.id; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link IndonesianStemmer} to stem Indonesian words. + */ +public final class IndonesianStemFilter extends TokenFilter { + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + private final IndonesianStemmer stemmer = new IndonesianStemmer(); + private final boolean stemDerivational; + + /** + * Calls {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)} + */ + public IndonesianStemFilter(TokenStream input) { + this(input, true); + } + + /** + * Create a new IndonesianStemFilter. + *

+ * If stemDerivational is false, + * only inflectional suffixes (particles and possessive pronouns) are stemmed. + */ + public IndonesianStemFilter(TokenStream input, boolean stemDerivational) { + super(input); + this.stemDerivational = stemDerivational; + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if(!keywordAtt.isKeyword()) { + final int newlen = + stemmer.stem(termAtt.buffer(), termAtt.length(), stemDerivational); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java new file mode 100644 index 00000000000..82afce8434a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java @@ -0,0 +1,304 @@ +package org.apache.lucene.analysis.id; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Stemmer for Indonesian. + *

+ * Stems Indonesian words with the algorithm presented in: + * A Study of Stemming Effects on Information Retrieval in + * Bahasa Indonesia, Fadillah Z Tala. + * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf + */ +public class IndonesianStemmer { + private int numSyllables; + private int flags; + private static final int REMOVED_KE = 1; + private static final int REMOVED_PENG = 2; + private static final int REMOVED_DI = 4; + private static final int REMOVED_MENG = 8; + private static final int REMOVED_TER = 16; + private static final int REMOVED_BER = 32; + private static final int REMOVED_PE = 64; + + /** + * Stem a term (returning its new length). + *

+ * Use stemDerivational to control whether full stemming + * or only light inflectional stemming is done. + */ + public int stem(char text[], int length, boolean stemDerivational) { + flags = 0; + numSyllables = 0; + for (int i = 0; i < length; i++) + if (isVowel(text[i])) + numSyllables++; + + if (numSyllables > 2) length = removeParticle(text, length); + if (numSyllables > 2) length = removePossessivePronoun(text, length); + + if (stemDerivational) + length = stemDerivational(text, length); + return length; + } + + private int stemDerivational(char text[], int length) { + int oldLength = length; + if (numSyllables > 2) length = removeFirstOrderPrefix(text, length); + if (oldLength != length) { // a rule is fired + oldLength = length; + if (numSyllables > 2) length = removeSuffix(text, length); + if (oldLength != length) // a rule is fired + if (numSyllables > 2) length = removeSecondOrderPrefix(text, length); + } else { // fail + if (numSyllables > 2) length = removeSecondOrderPrefix(text, length); + if (numSyllables > 2) length = removeSuffix(text, length); + } + return length; + } + + private boolean isVowel(char ch) { + switch(ch) { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + return true; + default: + return false; + } + } + + private int removeParticle(char text[], int length) { + if (endsWith(text, length, "kah") || + endsWith(text, length, "lah") || + endsWith(text, length, "pun")) { + numSyllables--; + return length - 3; + } + + return length; + } + + private int removePossessivePronoun(char text[], int length) { + if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) { + numSyllables--; + return length - 2; + } + + if (endsWith(text, length, "nya")) { + numSyllables--; + return length - 3; + } + + return length; + } + + private int removeFirstOrderPrefix(char text[], int length) { + if (startsWith(text, length, "meng")) { + flags |= REMOVED_MENG; + numSyllables--; + return deleteN(text, 0, length, 4); + } + + if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) { + flags |= REMOVED_MENG; + text[3] = 's'; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "men")) { + flags |= REMOVED_MENG; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "mem")) { + flags |= REMOVED_MENG; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "me")) { + flags |= REMOVED_MENG; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + if (startsWith(text, length, "peng")) { + flags |= REMOVED_PENG; + numSyllables--; + return deleteN(text, 0, length, 4); + } + + if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) { + flags |= REMOVED_PENG; + text[3] = 's'; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "peny")) { + flags |= REMOVED_PENG; + numSyllables--; + return deleteN(text, 0, length, 4); + } + + if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) { + flags |= REMOVED_PENG; + text[2] = 't'; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + if (startsWith(text, length, "pen")) { + flags |= REMOVED_PENG; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "pem")) { + flags |= REMOVED_PENG; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "di")) { + flags |= REMOVED_DI; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + if (startsWith(text, length, "ter")) { + flags |= REMOVED_TER; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "ke")) { + flags |= REMOVED_KE; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + return length; + } + + private int removeSecondOrderPrefix(char text[], int length) { + if (startsWith(text, length, "ber")) { + flags |= REMOVED_BER; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (length == 7 && startsWith(text, length, "belajar")) { + flags |= REMOVED_BER; + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "be") && length > 4 + && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') { + flags |= REMOVED_BER; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + if (startsWith(text, length, "per")) { + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (length == 7 && startsWith(text, length, "pelajar")) { + numSyllables--; + return deleteN(text, 0, length, 3); + } + + if (startsWith(text, length, "pe")) { + flags |= REMOVED_PE; + numSyllables--; + return deleteN(text, 0, length, 2); + } + + return length; + } + + private int removeSuffix(char text[], int length) { + if (endsWith(text, length, "kan") + && (flags & REMOVED_KE) == 0 + && (flags & REMOVED_PENG) == 0 + && (flags & REMOVED_PE) == 0) { + numSyllables--; + return length - 3; + } + + if (endsWith(text, length, "an") + && (flags & REMOVED_DI) == 0 + && (flags & REMOVED_MENG) == 0 + && (flags & REMOVED_TER) == 0) { + numSyllables--; + return length - 2; + } + + if (endsWith(text, length, "i") + && !endsWith(text, length, "si") + && (flags & REMOVED_BER) == 0 + && (flags & REMOVED_KE) == 0 + && (flags & REMOVED_PENG) == 0) { + numSyllables--; + return length - 1; + } + return length; + } + + private boolean startsWith(char s[], int len, String prefix) { + final int prefixLen = prefix.length(); + if (prefixLen > len) + return false; + for (int i = 0; i < prefixLen; i++) + if (s[i] != prefix.charAt(i)) + return false; + return true; + } + + private boolean endsWith(char s[], int len, String suffix) { + final int suffixLen = suffix.length(); + if (suffixLen > len) + return false; + for (int i = suffixLen - 1; i >= 0; i--) + if (s[len -(suffixLen - i)] != suffix.charAt(i)) + return false; + + return true; + } + + private int deleteN(char s[], int pos, int len, int nChars) { + for (int i = 0; i < nChars; i++) + len = delete(s, pos, len); + return len; + } + + private int delete(char s[], int pos, int len) { + if (pos < len) + System.arraycopy(s, pos + 1, s, pos, len - pos - 1); + + return len - 1; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html new file mode 100644 index 00000000000..52786713ab6 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html @@ -0,0 +1,22 @@ + + + + +Analyzer for Indonesian. + + diff --git a/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt b/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt new file mode 100644 index 00000000000..4617f83a5c5 --- /dev/null +++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java new file mode 100644 index 00000000000..28877a700e9 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.id; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase { + /** This test fails with NPE when the + * stopwords file is missing in classpath */ + public void testResourcesAvailable() { + new IndonesianAnalyzer(TEST_VERSION_CURRENT); + } + + /** test stopwords and stemming */ + public void testBasics() throws IOException { + Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT); + // stemming + checkOneTermReuse(a, "peledakan", "ledak"); + checkOneTermReuse(a, "pembunuhan", "bunuh"); + // stopword + assertAnalyzesTo(a, "bahwa", new String[] {}); + } + + /** test use of exclusion set */ + public void testExclude() throws IOException { + Set exclusionSet = new HashSet(); + exclusionSet.add("peledakan"); + Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT, + IndonesianAnalyzer.getDefaultStopSet(), exclusionSet); + checkOneTermReuse(a, "peledakan", "peledakan"); + checkOneTermReuse(a, "pembunuhan", "bunuh"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java new file mode 100644 index 00000000000..09c3c94252b --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java @@ -0,0 +1,136 @@ +package org.apache.lucene.analysis.id; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.ReusableAnalyzerBase; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Tests {@link IndonesianStemmer} + */ +public class TestIndonesianStemmer extends BaseTokenStreamTestCase { + /* full stemming, no stopwords */ + Analyzer a = new ReusableAnalyzerBase() { + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer)); + } + }; + + /** Some examples from the paper */ + public void testExamples() throws IOException { + checkOneTerm(a, "bukukah", "buku"); + checkOneTermReuse(a, "adalah", "ada"); + checkOneTermReuse(a, "bukupun", "buku"); + checkOneTermReuse(a, "bukuku", "buku"); + checkOneTermReuse(a, "bukumu", "buku"); + checkOneTermReuse(a, "bukunya", "buku"); + checkOneTermReuse(a, "mengukur", "ukur"); + checkOneTermReuse(a, "menyapu", "sapu"); + checkOneTermReuse(a, "menduga", "duga"); + checkOneTermReuse(a, "menuduh", "uduh"); + checkOneTermReuse(a, "membaca", "baca"); + checkOneTermReuse(a, "merusak", "rusak"); + checkOneTermReuse(a, "pengukur", "ukur"); + checkOneTermReuse(a, "penyapu", "sapu"); + checkOneTermReuse(a, "penduga", "duga"); + checkOneTermReuse(a, "pembaca", "baca"); + checkOneTermReuse(a, "diukur", "ukur"); + checkOneTermReuse(a, "tersapu", "sapu"); + checkOneTermReuse(a, "kekasih", "kasih"); + checkOneTermReuse(a, "berlari", "lari"); + checkOneTermReuse(a, "belajar", "ajar"); + checkOneTermReuse(a, "bekerja", "kerja"); + checkOneTermReuse(a, "perjelas", "jelas"); + checkOneTermReuse(a, "pelajar", "ajar"); + checkOneTermReuse(a, "pekerja", "kerja"); + checkOneTermReuse(a, "tarikkan", "tarik"); + checkOneTermReuse(a, "ambilkan", "ambil"); + checkOneTermReuse(a, "mengambilkan", "ambil"); + checkOneTermReuse(a, "makanan", "makan"); + checkOneTermReuse(a, "janjian", "janji"); + checkOneTermReuse(a, "perjanjian", "janji"); + checkOneTermReuse(a, "tandai", "tanda"); + checkOneTermReuse(a, "dapati", "dapat"); + checkOneTermReuse(a, "mendapati", "dapat"); + checkOneTermReuse(a, "pantai", "panta"); + } + + /** Some detailed analysis examples (that might not be the best) */ + public void testIRExamples() throws IOException { + checkOneTerm(a, "penyalahgunaan", "salahguna"); + checkOneTermReuse(a, "menyalahgunakan", "salahguna"); + checkOneTermReuse(a, "disalahgunakan", "salahguna"); + + checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab"); + checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab"); + checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab"); + + checkOneTermReuse(a, "pelaksanaan", "laksana"); + checkOneTermReuse(a, "pelaksana", "laksana"); + checkOneTermReuse(a, "melaksanakan", "laksana"); + checkOneTermReuse(a, "dilaksanakan", "laksana"); + + checkOneTermReuse(a, "melibatkan", "libat"); + checkOneTermReuse(a, "terlibat", "libat"); + + checkOneTermReuse(a, "penculikan", "culik"); + checkOneTermReuse(a, "menculik", "culik"); + checkOneTermReuse(a, "diculik", "culik"); + checkOneTermReuse(a, "penculik", "culik"); + + checkOneTermReuse(a, "perubahan", "ubah"); + checkOneTermReuse(a, "peledakan", "ledak"); + checkOneTermReuse(a, "penanganan", "tangan"); + checkOneTermReuse(a, "kepolisian", "polisi"); + checkOneTermReuse(a, "kenaikan", "naik"); + checkOneTermReuse(a, "bersenjata", "senjata"); + checkOneTermReuse(a, "penyelewengan", "seleweng"); + checkOneTermReuse(a, "kecelakaan", "celaka"); + } + + /* inflectional-only stemming */ + Analyzer b = new ReusableAnalyzerBase() { + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false)); + } + }; + + /** Test stemming only inflectional suffixes */ + public void testInflectionalOnly() throws IOException { + checkOneTerm(b, "bukunya", "buku"); + checkOneTermReuse(b, "bukukah", "buku"); + checkOneTermReuse(b, "bukunyakah", "buku"); + checkOneTermReuse(b, "dibukukannya", "dibukukan"); + } + + public void testShouldntStem() throws IOException { + checkOneTerm(a, "bersenjata", "senjata"); + checkOneTermReuse(a, "bukukah", "buku"); + checkOneTermReuse(a, "gigi", "gigi"); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java new file mode 100644 index 00000000000..06144903872 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java @@ -0,0 +1,37 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.id.IndonesianStemFilter; + +/** Factory for {@link IndonesianStemFilter} */ +public class IndonesianStemFilterFactory extends BaseTokenFilterFactory { + private boolean stemDerivational = true; + + public void init(Map args) { + super.init(args); + stemDerivational = getBoolean("stemDerivational", true); + } + + public TokenStream create(TokenStream input) { + return new IndonesianStemFilter(input, stemDerivational); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java new file mode 100644 index 00000000000..37a35af848e --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java @@ -0,0 +1,59 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Indonesian stem filter factory is working. + */ +public class TestIndonesianStemFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually stems text. + */ + public void testStemming() throws Exception { + Reader reader = new StringReader("dibukukannya"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); + Map args = new HashMap(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "buku" }); + } + + /** + * Test inflectional-only mode + */ + public void testStemmingInflectional() throws Exception { + Reader reader = new StringReader("dibukukannya"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); + Map args = new HashMap(); + args.put("stemDerivational", "false"); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "dibukukan" }); + } +}