mirror of https://github.com/apache/lucene.git
LUCENE-2437: Indonesian Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942235 13f79535-47bb-0310-9956-ffa450edef68
parent 37a081173a
commit 1b020be130
CHANGES.txt
@@ -152,6 +152,8 @@ New features
  of AttributeSource.cloneAttributes() instances and the new copyTo() method.
  (Steven Rowe via Uwe Schindler)

* LUCENE-2437: Add an Analyzer for Indonesian. (Robert Muir)

Build

* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
org/apache/lucene/analysis/id/IndonesianAnalyzer.java
@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.id;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 * Analyzer for Indonesian (Bahasa Indonesia).
 */
public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
  /** File containing default Indonesian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  private final Set<?> stemExclusionSet;

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public IndonesianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   */
  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set
   * is provided, this analyzer will add a {@link KeywordMarkerFilter} before
   * {@link IndonesianStemFilter}.
   *
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   * @param stemExclusionSet
   *          a set of terms not to be stemmed
   */
  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates
   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   *
   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         a {@link KeywordMarkerFilter} if a stem exclusion set is provided,
   *         and {@link IndonesianStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    }
    return new TokenStreamComponents(source, new IndonesianStemFilter(result));
  }
}
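A minimal usage sketch (not part of this commit) showing how the new analyzer might be driven directly. The field name, sample text, and the Version constant are illustrative assumptions; the expected output ("bunuh", "senjata", with the stopword "yang" removed) follows the behavior exercised in TestIndonesianAnalyzer further below.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class IndonesianAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // hypothetical example: "yang" is a stopword, "pembunuhan" stems to
    // "bunuh" and "bersenjata" to "senjata"
    IndonesianAnalyzer analyzer = new IndonesianAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("pembunuhan yang bersenjata"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}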
org/apache/lucene/analysis/id/IndonesianStemFilter.java
@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.id;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * A {@link TokenFilter} that applies {@link IndonesianStemmer} to stem Indonesian words.
 */
public final class IndonesianStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final IndonesianStemmer stemmer = new IndonesianStemmer();
  private final boolean stemDerivational;

  /**
   * Calls {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)}.
   */
  public IndonesianStemFilter(TokenStream input) {
    this(input, true);
  }

  /**
   * Create a new IndonesianStemFilter.
   * <p>
   * If <code>stemDerivational</code> is false,
   * only inflectional suffixes (particles and possessive pronouns) are stemmed.
   */
  public IndonesianStemFilter(TokenStream input, boolean stemDerivational) {
    super(input);
    this.stemDerivational = stemDerivational;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (!keywordAtt.isKeyword()) {
        final int newlen =
          stemmer.stem(termAtt.buffer(), termAtt.length(), stemDerivational);
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
}
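A hedged sketch of wiring this filter behind a KeywordMarkerFilter, mirroring what IndonesianAnalyzer does with its stem exclusion set: tokens marked as keywords keep their surface form, everything else is stemmed. The tokenizer choice, the CharArraySet constructor arguments, and the sample terms are illustrative assumptions; the expected result ("peledakan" kept, "pembunuhan" stemmed to "bunuh") matches TestIndonesianAnalyzer.testExclude().

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.id.IndonesianStemFilter;
import org.apache.lucene.util.Version;

public class StemExclusionSketch {
  public static TokenStream build(String text) {
    // split on whitespace, protect "peledakan" from stemming, stem the rest
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(text));
    CharArraySet exclusions = new CharArraySet(Version.LUCENE_CURRENT,
        Arrays.asList("peledakan"), false);
    return new IndonesianStemFilter(new KeywordMarkerFilter(tokenizer, exclusions));
  }
}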
org/apache/lucene/analysis/id/IndonesianStemmer.java
@@ -0,0 +1,304 @@
package org.apache.lucene.analysis.id;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Stemmer for Indonesian.
 * <p>
 * Stems Indonesian words with the algorithm presented in:
 * <i>A Study of Stemming Effects on Information Retrieval in
 * Bahasa Indonesia</i>, Fadillah Z Tala.
 * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
 */
public class IndonesianStemmer {
  private int numSyllables;
  private int flags;
  private static final int REMOVED_KE = 1;
  private static final int REMOVED_PENG = 2;
  private static final int REMOVED_DI = 4;
  private static final int REMOVED_MENG = 8;
  private static final int REMOVED_TER = 16;
  private static final int REMOVED_BER = 32;
  private static final int REMOVED_PE = 64;

  /**
   * Stem a term (returning its new length).
   * <p>
   * Use <code>stemDerivational</code> to control whether full stemming
   * or only light inflectional stemming is done.
   */
  public int stem(char text[], int length, boolean stemDerivational) {
    flags = 0;
    numSyllables = 0;
    for (int i = 0; i < length; i++)
      if (isVowel(text[i]))
        numSyllables++;

    if (numSyllables > 2) length = removeParticle(text, length);
    if (numSyllables > 2) length = removePossessivePronoun(text, length);

    if (stemDerivational)
      length = stemDerivational(text, length);
    return length;
  }

  private int stemDerivational(char text[], int length) {
    int oldLength = length;
    if (numSyllables > 2) length = removeFirstOrderPrefix(text, length);
    if (oldLength != length) { // a rule is fired
      oldLength = length;
      if (numSyllables > 2) length = removeSuffix(text, length);
      if (oldLength != length) // a rule is fired
        if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
    } else { // fail
      if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
      if (numSyllables > 2) length = removeSuffix(text, length);
    }
    return length;
  }

  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        return true;
      default:
        return false;
    }
  }

  private int removeParticle(char text[], int length) {
    if (endsWith(text, length, "kah") ||
        endsWith(text, length, "lah") ||
        endsWith(text, length, "pun")) {
      numSyllables--;
      return length - 3;
    }

    return length;
  }

  private int removePossessivePronoun(char text[], int length) {
    if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
      numSyllables--;
      return length - 2;
    }

    if (endsWith(text, length, "nya")) {
      numSyllables--;
      return length - 3;
    }

    return length;
  }

  private int removeFirstOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "meng")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }

    if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
      flags |= REMOVED_MENG;
      text[3] = 's';
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "men")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "mem")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "me")) {
      flags |= REMOVED_MENG;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    if (startsWith(text, length, "peng")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }

    if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
      flags |= REMOVED_PENG;
      text[3] = 's';
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "peny")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 4);
    }

    if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
      flags |= REMOVED_PENG;
      text[2] = 't';
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    if (startsWith(text, length, "pen")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "pem")) {
      flags |= REMOVED_PENG;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "di")) {
      flags |= REMOVED_DI;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    if (startsWith(text, length, "ter")) {
      flags |= REMOVED_TER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "ke")) {
      flags |= REMOVED_KE;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    return length;
  }

  private int removeSecondOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "ber")) {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (length == 7 && startsWith(text, length, "belajar")) {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "be") && length > 4
        && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') {
      flags |= REMOVED_BER;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    if (startsWith(text, length, "per")) {
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (length == 7 && startsWith(text, length, "pelajar")) {
      numSyllables--;
      return deleteN(text, 0, length, 3);
    }

    if (startsWith(text, length, "pe")) {
      flags |= REMOVED_PE;
      numSyllables--;
      return deleteN(text, 0, length, 2);
    }

    return length;
  }

  private int removeSuffix(char text[], int length) {
    if (endsWith(text, length, "kan")
        && (flags & REMOVED_KE) == 0
        && (flags & REMOVED_PENG) == 0
        && (flags & REMOVED_PE) == 0) {
      numSyllables--;
      return length - 3;
    }

    if (endsWith(text, length, "an")
        && (flags & REMOVED_DI) == 0
        && (flags & REMOVED_MENG) == 0
        && (flags & REMOVED_TER) == 0) {
      numSyllables--;
      return length - 2;
    }

    if (endsWith(text, length, "i")
        && !endsWith(text, length, "si")
        && (flags & REMOVED_BER) == 0
        && (flags & REMOVED_KE) == 0
        && (flags & REMOVED_PENG) == 0) {
      numSyllables--;
      return length - 1;
    }
    return length;
  }

  private boolean startsWith(char s[], int len, String prefix) {
    final int prefixLen = prefix.length();
    if (prefixLen > len)
      return false;
    for (int i = 0; i < prefixLen; i++)
      if (s[i] != prefix.charAt(i))
        return false;
    return true;
  }

  private boolean endsWith(char s[], int len, String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len)
      return false;
    for (int i = suffixLen - 1; i >= 0; i--)
      if (s[len - (suffixLen - i)] != suffix.charAt(i))
        return false;

    return true;
  }

  private int deleteN(char s[], int pos, int len, int nChars) {
    for (int i = 0; i < nChars; i++)
      len = delete(s, pos, len);
    return len;
  }

  private int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);

    return len - 1;
  }
}
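The stemmer can also be invoked directly on a character buffer. A small hypothetical sketch, assuming only the in-place contract of stem() shown above (the buffer is rewritten and the new logical length is returned); the sample word "mengukur" stems to "ukur", as exercised in TestIndonesianStemmer below.

import org.apache.lucene.analysis.id.IndonesianStemmer;

public class IndonesianStemmerSketch {
  public static void main(String[] args) {
    IndonesianStemmer stemmer = new IndonesianStemmer();
    char[] buffer = "mengukur".toCharArray();
    // full (derivational + inflectional) stemming: the meng- prefix rule applies
    int newLength = stemmer.stem(buffer, buffer.length, true);
    System.out.println(new String(buffer, 0, newLength)); // prints "ukur"
  }
}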
org/apache/lucene/analysis/id/package.html
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Indonesian.
</body>
</html>
org/apache/lucene/analysis/id/stopwords.txt
@@ -0,0 +1,359 @@
# from appendix D of: A Study of Stemming Effects on Information
# Retrieval in Bahasa Indonesia
ada
adanya
adalah
adapun
agak
agaknya
agar
akan
akankah
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
diantaranya
antara
antaranya
diantara
apa
apaan
mengapa
apabila
apakah
apalagi
apatah
atau
ataukah
ataupun
bagai
bagaikan
sebagai
sebagainya
bagaimana
bagaimanapun
sebagaimana
bagaimanakah
bagi
bahkan
bahwa
bahwasanya
sebaliknya
banyak
sebanyak
beberapa
seberapa
begini
beginian
beginikah
beginilah
sebegini
begitu
begitukah
begitulah
begitupun
sebegitu
belum
belumlah
sebelum
sebelumnya
sebenarnya
berapa
berapakah
berapalah
berapapun
betulkah
sebetulnya
biasa
biasanya
bila
bilakah
bisa
bisakah
sebisanya
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
cuma
percuma
dahulu
dalam
dan
dapat
dari
daripada
dekat
demi
demikian
demikianlah
sedemikian
dengan
depan
di
dia
dialah
dini
diri
dirinya
terdiri
dong
dulu
enggak
enggaknya
entah
entahlah
terhadap
terhadapnya
hal
hampir
hanya
hanyalah
harus
haruslah
harusnya
seharusnya
hendak
hendaklah
hendaknya
hingga
sehingga
ia
ialah
ibarat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jangan
jangankan
janganlah
jika
jikalau
juga
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
dikarenakan
karena
karenanya
ke
kecil
kemudian
kenapa
kepada
kepadanya
ketika
seketika
khususnya
kini
kinilah
kiranya
sekiranya
kita
kitalah
kok
lagi
lagian
selagi
lah
lain
lainnya
melainkan
selaku
lalu
melalui
terlalu
lama
lamanya
selama
selama
selamanya
lebih
terlebih
bermacam
macam
semacam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masih
masihkah
semasih
masing
mau
maupun
semaunya
memang
mereka
merekalah
meski
meskipun
semula
mungkin
mungkinkah
nah
namun
nanti
nantinya
nyaris
oleh
olehnya
seorang
seseorang
pada
padanya
padahal
paling
sepanjang
pantas
sepantasnya
sepantasnyalah
para
pasti
pastilah
per
pernah
pula
pun
merupakan
rupanya
serupa
saat
saatnya
sesaat
saja
sajalah
saling
bersama
sama
sesama
sambil
sampai
sana
sangat
sangatlah
saya
sayalah
se
sebab
sebabnya
sebuah
tersebut
tersebutlah
sedang
sedangkan
sedikit
sedikitnya
segala
segalanya
segera
sesegera
sejak
sejenak
sekali
sekalian
sekalipun
sesekali
sekaligus
sekarang
sekarang
sekitar
sekitarnya
sela
selain
selalu
seluruh
seluruhnya
semakin
sementara
sempat
semua
semuanya
sendiri
sendirinya
seolah
seperti
sepertinya
sering
seringnya
serta
siapa
siapakah
siapapun
disini
disinilah
sini
sinilah
sesuatu
sesuatunya
suatu
sesudah
sesudahnya
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tak
tanpa
setelah
telah
tentang
tentu
tentulah
tentunya
tertentu
seterusnya
tapi
tetapi
setiap
tiap
setidaknya
tidak
tidakkah
tidaklah
toh
waduh
wah
wahai
sewaktu
walau
walaupun
wong
yaitu
yakni
yang
org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
@@ -0,0 +1,53 @@
package org.apache.lucene.analysis.id;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   *  stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new IndonesianAnalyzer(TEST_VERSION_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
    // stemming
    checkOneTermReuse(a, "peledakan", "ledak");
    checkOneTermReuse(a, "pembunuhan", "bunuh");
    // stopword
    assertAnalyzesTo(a, "bahwa", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("peledakan");
    Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
        IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "peledakan", "peledakan");
    checkOneTermReuse(a, "pembunuhan", "bunuh");
  }
}
org/apache/lucene/analysis/id/TestIndonesianStemmer.java
@@ -0,0 +1,136 @@
package org.apache.lucene.analysis.id;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests {@link IndonesianStemmer}
 */
public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
  /* full stemming, no stopwords */
  Analyzer a = new ReusableAnalyzerBase() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
    }
  };

  /** Some examples from the paper */
  public void testExamples() throws IOException {
    checkOneTerm(a, "bukukah", "buku");
    checkOneTermReuse(a, "adalah", "ada");
    checkOneTermReuse(a, "bukupun", "buku");
    checkOneTermReuse(a, "bukuku", "buku");
    checkOneTermReuse(a, "bukumu", "buku");
    checkOneTermReuse(a, "bukunya", "buku");
    checkOneTermReuse(a, "mengukur", "ukur");
    checkOneTermReuse(a, "menyapu", "sapu");
    checkOneTermReuse(a, "menduga", "duga");
    checkOneTermReuse(a, "menuduh", "uduh");
    checkOneTermReuse(a, "membaca", "baca");
    checkOneTermReuse(a, "merusak", "rusak");
    checkOneTermReuse(a, "pengukur", "ukur");
    checkOneTermReuse(a, "penyapu", "sapu");
    checkOneTermReuse(a, "penduga", "duga");
    checkOneTermReuse(a, "pembaca", "baca");
    checkOneTermReuse(a, "diukur", "ukur");
    checkOneTermReuse(a, "tersapu", "sapu");
    checkOneTermReuse(a, "kekasih", "kasih");
    checkOneTermReuse(a, "berlari", "lari");
    checkOneTermReuse(a, "belajar", "ajar");
    checkOneTermReuse(a, "bekerja", "kerja");
    checkOneTermReuse(a, "perjelas", "jelas");
    checkOneTermReuse(a, "pelajar", "ajar");
    checkOneTermReuse(a, "pekerja", "kerja");
    checkOneTermReuse(a, "tarikkan", "tarik");
    checkOneTermReuse(a, "ambilkan", "ambil");
    checkOneTermReuse(a, "mengambilkan", "ambil");
    checkOneTermReuse(a, "makanan", "makan");
    checkOneTermReuse(a, "janjian", "janji");
    checkOneTermReuse(a, "perjanjian", "janji");
    checkOneTermReuse(a, "tandai", "tanda");
    checkOneTermReuse(a, "dapati", "dapat");
    checkOneTermReuse(a, "mendapati", "dapat");
    checkOneTermReuse(a, "pantai", "panta");
  }

  /** Some detailed analysis examples (that might not be the best) */
  public void testIRExamples() throws IOException {
    checkOneTerm(a, "penyalahgunaan", "salahguna");
    checkOneTermReuse(a, "menyalahgunakan", "salahguna");
    checkOneTermReuse(a, "disalahgunakan", "salahguna");

    checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
    checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
    checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");

    checkOneTermReuse(a, "pelaksanaan", "laksana");
    checkOneTermReuse(a, "pelaksana", "laksana");
    checkOneTermReuse(a, "melaksanakan", "laksana");
    checkOneTermReuse(a, "dilaksanakan", "laksana");

    checkOneTermReuse(a, "melibatkan", "libat");
    checkOneTermReuse(a, "terlibat", "libat");

    checkOneTermReuse(a, "penculikan", "culik");
    checkOneTermReuse(a, "menculik", "culik");
    checkOneTermReuse(a, "diculik", "culik");
    checkOneTermReuse(a, "penculik", "culik");

    checkOneTermReuse(a, "perubahan", "ubah");
    checkOneTermReuse(a, "peledakan", "ledak");
    checkOneTermReuse(a, "penanganan", "tangan");
    checkOneTermReuse(a, "kepolisian", "polisi");
    checkOneTermReuse(a, "kenaikan", "naik");
    checkOneTermReuse(a, "bersenjata", "senjata");
    checkOneTermReuse(a, "penyelewengan", "seleweng");
    checkOneTermReuse(a, "kecelakaan", "celaka");
  }

  /* inflectional-only stemming */
  Analyzer b = new ReusableAnalyzerBase() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
    }
  };

  /** Test stemming only inflectional suffixes */
  public void testInflectionalOnly() throws IOException {
    checkOneTerm(b, "bukunya", "buku");
    checkOneTermReuse(b, "bukukah", "buku");
    checkOneTermReuse(b, "bukunyakah", "buku");
    checkOneTermReuse(b, "dibukukannya", "dibukukan");
  }

  public void testShouldntStem() throws IOException {
    checkOneTerm(a, "bersenjata", "senjata");
    checkOneTermReuse(a, "bukukah", "buku");
    checkOneTermReuse(a, "gigi", "gigi");
  }
}
org/apache/solr/analysis/IndonesianStemFilterFactory.java
@@ -0,0 +1,37 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.id.IndonesianStemFilter;

/** Factory for {@link IndonesianStemFilter} */
public class IndonesianStemFilterFactory extends BaseTokenFilterFactory {
  private boolean stemDerivational = true;

  public void init(Map<String, String> args) {
    super.init(args);
    stemDerivational = getBoolean("stemDerivational", true);
  }

  public TokenStream create(TokenStream input) {
    return new IndonesianStemFilter(input, stemDerivational);
  }
}
org/apache/solr/analysis/TestIndonesianStemFilterFactory.java
@@ -0,0 +1,59 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**
 * Simple tests to ensure the Indonesian stem filter factory is working.
 */
public class TestIndonesianStemFilterFactory extends BaseTokenTestCase {
  /**
   * Ensure the filter actually stems text.
   */
  public void testStemming() throws Exception {
    Reader reader = new StringReader("dibukukannya");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    factory.init(args);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "buku" });
  }

  /**
   * Test inflectional-only mode
   */
  public void testStemmingInflectional() throws Exception {
    Reader reader = new StringReader("dibukukannya");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("stemDerivational", "false");
    factory.init(args);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "dibukukan" });
  }
}