LUCENE-2437: Indonesian Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942235 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-07 21:21:12 +00:00
parent 37a081173a
commit 1b020be130
10 changed files with 1169 additions and 0 deletions

View File

@ -152,6 +152,8 @@ New features
of AttributeSource.cloneAttributes() instances and the new copyTo() method. of AttributeSource.cloneAttributes() instances and the new copyTo() method.
(Steven Rowe via Uwe Schindler) (Steven Rowe via Uwe Schindler)
* LUCENE-2437: Add an Analyzer for Indonesian. (Robert Muir)
Build Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -0,0 +1,130 @@
package org.apache.lucene.analysis.id;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
* Analyzer for Indonesian (Bahasa)
*/
public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Indonesian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.;
*/
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
}
private final Set<?> stemExclusionSet;
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public IndonesianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* {@link IndonesianStemFilter}.
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a set of terms not to be stemmed
*/
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided and {@link IndonesianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) {
result = new KeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.analysis.id;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link IndonesianStemmer} to stem Indonesian words.
*/
public final class IndonesianStemFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final IndonesianStemmer stemmer = new IndonesianStemmer();
private final boolean stemDerivational;
/**
* Calls {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)}
*/
public IndonesianStemFilter(TokenStream input) {
this(input, true);
}
/**
* Create a new IndonesianStemFilter.
* <p>
* If <code>stemDerivational</code> is false,
* only inflectional suffixes (particles and possessive pronouns) are stemmed.
*/
public IndonesianStemFilter(TokenStream input, boolean stemDerivational) {
super(input);
this.stemDerivational = stemDerivational;
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if(!keywordAtt.isKeyword()) {
final int newlen =
stemmer.stem(termAtt.buffer(), termAtt.length(), stemDerivational);
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}

View File

@ -0,0 +1,304 @@
package org.apache.lucene.analysis.id;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Stemmer for Indonesian.
* <p>
* Stems Indonesian words with the algorithm presented in:
* <i>A Study of Stemming Effects on Information Retrieval in
* Bahasa Indonesia</i>, Fadillah Z Tala.
* http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
*/
public class IndonesianStemmer {
private int numSyllables;
private int flags;
private static final int REMOVED_KE = 1;
private static final int REMOVED_PENG = 2;
private static final int REMOVED_DI = 4;
private static final int REMOVED_MENG = 8;
private static final int REMOVED_TER = 16;
private static final int REMOVED_BER = 32;
private static final int REMOVED_PE = 64;
/**
* Stem a term (returning its new length).
* <p>
* Use <code>stemDerivational</code> to control whether full stemming
* or only light inflectional stemming is done.
*/
public int stem(char text[], int length, boolean stemDerivational) {
flags = 0;
numSyllables = 0;
for (int i = 0; i < length; i++)
if (isVowel(text[i]))
numSyllables++;
if (numSyllables > 2) length = removeParticle(text, length);
if (numSyllables > 2) length = removePossessivePronoun(text, length);
if (stemDerivational)
length = stemDerivational(text, length);
return length;
}
private int stemDerivational(char text[], int length) {
int oldLength = length;
if (numSyllables > 2) length = removeFirstOrderPrefix(text, length);
if (oldLength != length) { // a rule is fired
oldLength = length;
if (numSyllables > 2) length = removeSuffix(text, length);
if (oldLength != length) // a rule is fired
if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
} else { // fail
if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
if (numSyllables > 2) length = removeSuffix(text, length);
}
return length;
}
private boolean isVowel(char ch) {
switch(ch) {
case 'a':
case 'e':
case 'i':
case 'o':
case 'u':
return true;
default:
return false;
}
}
private int removeParticle(char text[], int length) {
if (endsWith(text, length, "kah") ||
endsWith(text, length, "lah") ||
endsWith(text, length, "pun")) {
numSyllables--;
return length - 3;
}
return length;
}
private int removePossessivePronoun(char text[], int length) {
if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
numSyllables--;
return length - 2;
}
if (endsWith(text, length, "nya")) {
numSyllables--;
return length - 3;
}
return length;
}
private int removeFirstOrderPrefix(char text[], int length) {
if (startsWith(text, length, "meng")) {
flags |= REMOVED_MENG;
numSyllables--;
return deleteN(text, 0, length, 4);
}
if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
flags |= REMOVED_MENG;
text[3] = 's';
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "men")) {
flags |= REMOVED_MENG;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "mem")) {
flags |= REMOVED_MENG;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "me")) {
flags |= REMOVED_MENG;
numSyllables--;
return deleteN(text, 0, length, 2);
}
if (startsWith(text, length, "peng")) {
flags |= REMOVED_PENG;
numSyllables--;
return deleteN(text, 0, length, 4);
}
if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
flags |= REMOVED_PENG;
text[3] = 's';
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "peny")) {
flags |= REMOVED_PENG;
numSyllables--;
return deleteN(text, 0, length, 4);
}
if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
flags |= REMOVED_PENG;
text[2] = 't';
numSyllables--;
return deleteN(text, 0, length, 2);
}
if (startsWith(text, length, "pen")) {
flags |= REMOVED_PENG;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "pem")) {
flags |= REMOVED_PENG;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "di")) {
flags |= REMOVED_DI;
numSyllables--;
return deleteN(text, 0, length, 2);
}
if (startsWith(text, length, "ter")) {
flags |= REMOVED_TER;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "ke")) {
flags |= REMOVED_KE;
numSyllables--;
return deleteN(text, 0, length, 2);
}
return length;
}
private int removeSecondOrderPrefix(char text[], int length) {
if (startsWith(text, length, "ber")) {
flags |= REMOVED_BER;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (length == 7 && startsWith(text, length, "belajar")) {
flags |= REMOVED_BER;
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "be") && length > 4
&& !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') {
flags |= REMOVED_BER;
numSyllables--;
return deleteN(text, 0, length, 2);
}
if (startsWith(text, length, "per")) {
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (length == 7 && startsWith(text, length, "pelajar")) {
numSyllables--;
return deleteN(text, 0, length, 3);
}
if (startsWith(text, length, "pe")) {
flags |= REMOVED_PE;
numSyllables--;
return deleteN(text, 0, length, 2);
}
return length;
}
private int removeSuffix(char text[], int length) {
if (endsWith(text, length, "kan")
&& (flags & REMOVED_KE) == 0
&& (flags & REMOVED_PENG) == 0
&& (flags & REMOVED_PE) == 0) {
numSyllables--;
return length - 3;
}
if (endsWith(text, length, "an")
&& (flags & REMOVED_DI) == 0
&& (flags & REMOVED_MENG) == 0
&& (flags & REMOVED_TER) == 0) {
numSyllables--;
return length - 2;
}
if (endsWith(text, length, "i")
&& !endsWith(text, length, "si")
&& (flags & REMOVED_BER) == 0
&& (flags & REMOVED_KE) == 0
&& (flags & REMOVED_PENG) == 0) {
numSyllables--;
return length - 1;
}
return length;
}
private boolean startsWith(char s[], int len, String prefix) {
final int prefixLen = prefix.length();
if (prefixLen > len)
return false;
for (int i = 0; i < prefixLen; i++)
if (s[i] != prefix.charAt(i))
return false;
return true;
}
private boolean endsWith(char s[], int len, String suffix) {
final int suffixLen = suffix.length();
if (suffixLen > len)
return false;
for (int i = suffixLen - 1; i >= 0; i--)
if (s[len -(suffixLen - i)] != suffix.charAt(i))
return false;
return true;
}
private int deleteN(char s[], int pos, int len, int nChars) {
for (int i = 0; i < nChars; i++)
len = delete(s, pos, len);
return len;
}
private int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Indonesian.
</body>
</html>

View File

@ -0,0 +1,359 @@
# from appendix D of: A Study of Stemming Effects on Information
# Retrieval in Bahasa Indonesia
ada
adanya
adalah
adapun
agak
agaknya
agar
akan
akankah
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
diantaranya
antara
antaranya
diantara
apa
apaan
mengapa
apabila
apakah
apalagi
apatah
atau
ataukah
ataupun
bagai
bagaikan
sebagai
sebagainya
bagaimana
bagaimanapun
sebagaimana
bagaimanakah
bagi
bahkan
bahwa
bahwasanya
sebaliknya
banyak
sebanyak
beberapa
seberapa
begini
beginian
beginikah
beginilah
sebegini
begitu
begitukah
begitulah
begitupun
sebegitu
belum
belumlah
sebelum
sebelumnya
sebenarnya
berapa
berapakah
berapalah
berapapun
betulkah
sebetulnya
biasa
biasanya
bila
bilakah
bisa
bisakah
sebisanya
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
cuma
percuma
dahulu
dalam
dan
dapat
dari
daripada
dekat
demi
demikian
demikianlah
sedemikian
dengan
depan
di
dia
dialah
dini
diri
dirinya
terdiri
dong
dulu
enggak
enggaknya
entah
entahlah
terhadap
terhadapnya
hal
hampir
hanya
hanyalah
harus
haruslah
harusnya
seharusnya
hendak
hendaklah
hendaknya
hingga
sehingga
ia
ialah
ibarat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jangan
jangankan
janganlah
jika
jikalau
juga
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
dikarenakan
karena
karenanya
ke
kecil
kemudian
kenapa
kepada
kepadanya
ketika
seketika
khususnya
kini
kinilah
kiranya
sekiranya
kita
kitalah
kok
lagi
lagian
selagi
lah
lain
lainnya
melainkan
selaku
lalu
melalui
terlalu
lama
lamanya
selama
selama
selamanya
lebih
terlebih
bermacam
macam
semacam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masih
masihkah
semasih
masing
mau
maupun
semaunya
memang
mereka
merekalah
meski
meskipun
semula
mungkin
mungkinkah
nah
namun
nanti
nantinya
nyaris
oleh
olehnya
seorang
seseorang
pada
padanya
padahal
paling
sepanjang
pantas
sepantasnya
sepantasnyalah
para
pasti
pastilah
per
pernah
pula
pun
merupakan
rupanya
serupa
saat
saatnya
sesaat
saja
sajalah
saling
bersama
sama
sesama
sambil
sampai
sana
sangat
sangatlah
saya
sayalah
se
sebab
sebabnya
sebuah
tersebut
tersebutlah
sedang
sedangkan
sedikit
sedikitnya
segala
segalanya
segera
sesegera
sejak
sejenak
sekali
sekalian
sekalipun
sesekali
sekaligus
sekarang
sekarang
sekitar
sekitarnya
sela
selain
selalu
seluruh
seluruhnya
semakin
sementara
sempat
semua
semuanya
sendiri
sendirinya
seolah
seperti
sepertinya
sering
seringnya
serta
siapa
siapakah
siapapun
disini
disinilah
sini
sinilah
sesuatu
sesuatunya
suatu
sesudah
sesudahnya
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tak
tanpa
setelah
telah
tentang
tentu
tentulah
tentunya
tertentu
seterusnya
tapi
tetapi
setiap
tiap
setidaknya
tidak
tidakkah
tidaklah
toh
waduh
wah
wahai
sewaktu
walau
walaupun
wong
yaitu
yakni
yang

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.id;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
public void testResourcesAvailable() {
new IndonesianAnalyzer(TEST_VERSION_CURRENT);
}
/** test stopwords and stemming */
public void testBasics() throws IOException {
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "peledakan", "ledak");
checkOneTermReuse(a, "pembunuhan", "bunuh");
// stopword
assertAnalyzesTo(a, "bahwa", new String[] {});
}
/** test use of exclusion set */
public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>();
exclusionSet.add("peledakan");
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "peledakan", "peledakan");
checkOneTermReuse(a, "pembunuhan", "bunuh");
}
}

View File

@ -0,0 +1,136 @@
package org.apache.lucene.analysis.id;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests {@link IndonesianStemmer}
*/
public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
/* full stemming, no stopwords */
Analyzer a = new ReusableAnalyzerBase() {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
}
};
/** Some examples from the paper */
public void testExamples() throws IOException {
checkOneTerm(a, "bukukah", "buku");
checkOneTermReuse(a, "adalah", "ada");
checkOneTermReuse(a, "bukupun", "buku");
checkOneTermReuse(a, "bukuku", "buku");
checkOneTermReuse(a, "bukumu", "buku");
checkOneTermReuse(a, "bukunya", "buku");
checkOneTermReuse(a, "mengukur", "ukur");
checkOneTermReuse(a, "menyapu", "sapu");
checkOneTermReuse(a, "menduga", "duga");
checkOneTermReuse(a, "menuduh", "uduh");
checkOneTermReuse(a, "membaca", "baca");
checkOneTermReuse(a, "merusak", "rusak");
checkOneTermReuse(a, "pengukur", "ukur");
checkOneTermReuse(a, "penyapu", "sapu");
checkOneTermReuse(a, "penduga", "duga");
checkOneTermReuse(a, "pembaca", "baca");
checkOneTermReuse(a, "diukur", "ukur");
checkOneTermReuse(a, "tersapu", "sapu");
checkOneTermReuse(a, "kekasih", "kasih");
checkOneTermReuse(a, "berlari", "lari");
checkOneTermReuse(a, "belajar", "ajar");
checkOneTermReuse(a, "bekerja", "kerja");
checkOneTermReuse(a, "perjelas", "jelas");
checkOneTermReuse(a, "pelajar", "ajar");
checkOneTermReuse(a, "pekerja", "kerja");
checkOneTermReuse(a, "tarikkan", "tarik");
checkOneTermReuse(a, "ambilkan", "ambil");
checkOneTermReuse(a, "mengambilkan", "ambil");
checkOneTermReuse(a, "makanan", "makan");
checkOneTermReuse(a, "janjian", "janji");
checkOneTermReuse(a, "perjanjian", "janji");
checkOneTermReuse(a, "tandai", "tanda");
checkOneTermReuse(a, "dapati", "dapat");
checkOneTermReuse(a, "mendapati", "dapat");
checkOneTermReuse(a, "pantai", "panta");
}
/** Some detailed analysis examples (that might not be the best) */
public void testIRExamples() throws IOException {
checkOneTerm(a, "penyalahgunaan", "salahguna");
checkOneTermReuse(a, "menyalahgunakan", "salahguna");
checkOneTermReuse(a, "disalahgunakan", "salahguna");
checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");
checkOneTermReuse(a, "pelaksanaan", "laksana");
checkOneTermReuse(a, "pelaksana", "laksana");
checkOneTermReuse(a, "melaksanakan", "laksana");
checkOneTermReuse(a, "dilaksanakan", "laksana");
checkOneTermReuse(a, "melibatkan", "libat");
checkOneTermReuse(a, "terlibat", "libat");
checkOneTermReuse(a, "penculikan", "culik");
checkOneTermReuse(a, "menculik", "culik");
checkOneTermReuse(a, "diculik", "culik");
checkOneTermReuse(a, "penculik", "culik");
checkOneTermReuse(a, "perubahan", "ubah");
checkOneTermReuse(a, "peledakan", "ledak");
checkOneTermReuse(a, "penanganan", "tangan");
checkOneTermReuse(a, "kepolisian", "polisi");
checkOneTermReuse(a, "kenaikan", "naik");
checkOneTermReuse(a, "bersenjata", "senjata");
checkOneTermReuse(a, "penyelewengan", "seleweng");
checkOneTermReuse(a, "kecelakaan", "celaka");
}
/* inflectional-only stemming */
Analyzer b = new ReusableAnalyzerBase() {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
}
};
/** Test stemming only inflectional suffixes */
public void testInflectionalOnly() throws IOException {
checkOneTerm(b, "bukunya", "buku");
checkOneTermReuse(b, "bukukah", "buku");
checkOneTermReuse(b, "bukunyakah", "buku");
checkOneTermReuse(b, "dibukukannya", "dibukukan");
}
public void testShouldntStem() throws IOException {
checkOneTerm(a, "bersenjata", "senjata");
checkOneTermReuse(a, "bukukah", "buku");
checkOneTermReuse(a, "gigi", "gigi");
}
}

View File

@ -0,0 +1,37 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.id.IndonesianStemFilter;
/** Factory for {@link IndonesianStemFilter} */
public class IndonesianStemFilterFactory extends BaseTokenFilterFactory {
private boolean stemDerivational = true;
public void init(Map<String, String> args) {
super.init(args);
stemDerivational = getBoolean("stemDerivational", true);
}
public TokenStream create(TokenStream input) {
return new IndonesianStemFilter(input, stemDerivational);
}
}

View File

@ -0,0 +1,59 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Indonesian stem filter factory is working.
*/
public class TestIndonesianStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("dibukukannya");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
Map<String,String> args = new HashMap<String,String>();
factory.init(args);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "buku" });
}
/**
* Test inflectional-only mode
*/
public void testStemmingInflectional() throws Exception {
Reader reader = new StringReader("dibukukannya");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("stemDerivational", "false");
factory.init(args);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "dibukukan" });
}
}