mirror of https://github.com/apache/lucene.git
LUCENE-2437: Indonesian Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942235 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
37a081173a
commit
1b020be130
|
@ -152,6 +152,8 @@ New features
|
||||||
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
||||||
(Steven Rowe via Uwe Schindler)
|
(Steven Rowe via Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-2437: Add an Analyzer for Indonesian. (Robert Muir)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Indonesian (Bahasa)
|
||||||
|
*/
|
||||||
|
public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
/** File containing default Indonesian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public IndonesianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param matchVersion
|
||||||
|
* lucene compatibility version
|
||||||
|
* @param stopwords
|
||||||
|
* a stopword set
|
||||||
|
*/
|
||||||
|
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerFilter} before
|
||||||
|
* {@link IndonesianStemFilter}.
|
||||||
|
*
|
||||||
|
* @param matchVersion
|
||||||
|
* lucene compatibility version
|
||||||
|
* @param stopwords
|
||||||
|
* a stopword set
|
||||||
|
* @param stemExclusionSet
|
||||||
|
* a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates
|
||||||
|
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* used to tokenize all the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
|
* built from an {@link StandardTokenizer} filtered with
|
||||||
|
* {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerFilter}
|
||||||
|
* if a stem exclusion set is provided and {@link IndonesianStemFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, source);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if (!stemExclusionSet.isEmpty()) {
|
||||||
|
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||||
|
}
|
||||||
|
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link IndonesianStemmer} to stem Indonesian words.
|
||||||
|
*/
|
||||||
|
public final class IndonesianStemFilter extends TokenFilter {
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
|
private final IndonesianStemmer stemmer = new IndonesianStemmer();
|
||||||
|
private final boolean stemDerivational;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)}
|
||||||
|
*/
|
||||||
|
public IndonesianStemFilter(TokenStream input) {
|
||||||
|
this(input, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new IndonesianStemFilter.
|
||||||
|
* <p>
|
||||||
|
* If <code>stemDerivational</code> is false,
|
||||||
|
* only inflectional suffixes (particles and possessive pronouns) are stemmed.
|
||||||
|
*/
|
||||||
|
public IndonesianStemFilter(TokenStream input, boolean stemDerivational) {
|
||||||
|
super(input);
|
||||||
|
this.stemDerivational = stemDerivational;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if(!keywordAtt.isKeyword()) {
|
||||||
|
final int newlen =
|
||||||
|
stemmer.stem(termAtt.buffer(), termAtt.length(), stemDerivational);
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,304 @@
|
||||||
|
package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Stemmer for Indonesian.
 * <p>
 * Stems Indonesian words with the algorithm presented in:
 * <i>A Study of Stemming Effects on Information Retrieval in
 * Bahasa Indonesia</i>, Fadillah Z Tala.
 * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
 * <p>
 * The stemmer keeps per-word working state ({@code numSyllables}, {@code flags})
 * in instance fields that are reset at the start of every {@link #stem} call, so
 * one instance must not be used by several threads at the same time.
 */
public class IndonesianStemmer {
  /** Number of vowels in the word being stemmed; lowered by one each time an affix is removed. */
  private int numSyllables;
  /** Bit set of the REMOVED_* constants: which prefixes have been stripped so far. */
  private int flags;
  private static final int REMOVED_KE = 1;
  private static final int REMOVED_PENG = 2;
  private static final int REMOVED_DI = 4;
  private static final int REMOVED_MENG = 8;
  private static final int REMOVED_TER = 16;
  private static final int REMOVED_BER = 32;
  private static final int REMOVED_PE = 64;

  /**
   * Stem a term (returning its new length).
   * <p>
   * Use <code>stemDerivational</code> to control whether full stemming
   * or only light inflectional stemming is done.
   *
   * @param text buffer holding the term; it is modified in place
   * @param length number of valid characters in <code>text</code>
   * @param stemDerivational if false, only particles and possessive pronouns are removed
   * @return the length of the stemmed term within <code>text</code>
   */
  public int stem(char text[], int length, boolean stemDerivational) {
    flags = 0;
    numSyllables = 0;
    for (int i = 0; i < length; i++) {
      if (isVowel(text[i])) {
        numSyllables++;
      }
    }

    // every rule only fires while more than two syllables remain
    if (numSyllables > 2) {
      length = removeParticle(text, length);
    }
    if (numSyllables > 2) {
      length = removePossessivePronoun(text, length);
    }

    if (stemDerivational) {
      length = stemDerivational(text, length);
    }
    return length;
  }

  /**
   * Removes derivational affixes. If a first-order prefix was removed, a suffix
   * is tried next and, only if that also fired, a second-order prefix. If no
   * first-order prefix matched, a second-order prefix and then a suffix are tried.
   */
  private int stemDerivational(char text[], int length) {
    int oldLength = length;
    if (numSyllables > 2) {
      length = removeFirstOrderPrefix(text, length);
    }
    if (oldLength != length) { // a rule is fired
      oldLength = length;
      if (numSyllables > 2) {
        length = removeSuffix(text, length);
      }
      if (oldLength != length) { // a rule is fired
        if (numSyllables > 2) {
          length = removeSecondOrderPrefix(text, length);
        }
      }
    } else { // fail
      if (numSyllables > 2) {
        length = removeSecondOrderPrefix(text, length);
      }
      if (numSyllables > 2) {
        length = removeSuffix(text, length);
      }
    }
    return length;
  }

  /** Returns true for the lowercase letters a, e, i, o and u. */
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        return true;
      default:
        return false;
    }
  }

  /** Removes a trailing particle: "kah", "lah" or "pun". */
  private int removeParticle(char text[], int length) {
    if (endsWith(text, length, "kah") ||
        endsWith(text, length, "lah") ||
        endsWith(text, length, "pun")) {
      numSyllables--;
      return length - 3;
    }

    return length;
  }

  /** Removes a trailing possessive pronoun: "ku", "mu" or "nya". */
  private int removePossessivePronoun(char text[], int length) {
    if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
      numSyllables--;
      return length - 2;
    }

    if (endsWith(text, length, "nya")) {
      numSyllables--;
      return length - 3;
    }

    return length;
  }

  /**
   * Removes one first-order prefix (meng-, meny-, men-, mem-, me-, peng-, peny-,
   * pen-, pem-, di-, ter-, ke-) and records which one in <code>flags</code>.
   * Rules are tried in order; the first match wins.
   */
  private int removeFirstOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "meng")) {
      return stripPrefix(text, length, 4, REMOVED_MENG);
    }

    if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
      text[3] = 's'; // "meny" + vowel: the 'y' becomes the stem's leading 's'
      return stripPrefix(text, length, 3, REMOVED_MENG);
    }

    if (startsWith(text, length, "men")) {
      return stripPrefix(text, length, 3, REMOVED_MENG);
    }

    if (startsWith(text, length, "mem")) {
      return stripPrefix(text, length, 3, REMOVED_MENG);
    }

    if (startsWith(text, length, "me")) {
      return stripPrefix(text, length, 2, REMOVED_MENG);
    }

    if (startsWith(text, length, "peng")) {
      return stripPrefix(text, length, 4, REMOVED_PENG);
    }

    if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
      text[3] = 's'; // "peny" + vowel: the 'y' becomes the stem's leading 's'
      return stripPrefix(text, length, 3, REMOVED_PENG);
    }

    if (startsWith(text, length, "peny")) {
      return stripPrefix(text, length, 4, REMOVED_PENG);
    }

    if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
      text[2] = 't'; // "pen" + vowel: the 'n' becomes the stem's leading 't'
      return stripPrefix(text, length, 2, REMOVED_PENG);
    }

    if (startsWith(text, length, "pen")) {
      return stripPrefix(text, length, 3, REMOVED_PENG);
    }

    if (startsWith(text, length, "pem")) {
      return stripPrefix(text, length, 3, REMOVED_PENG);
    }

    if (startsWith(text, length, "di")) {
      return stripPrefix(text, length, 2, REMOVED_DI);
    }

    if (startsWith(text, length, "ter")) {
      return stripPrefix(text, length, 3, REMOVED_TER);
    }

    if (startsWith(text, length, "ke")) {
      return stripPrefix(text, length, 2, REMOVED_KE);
    }

    return length;
  }

  /**
   * Removes one second-order prefix (ber-, be-, per-, pe-, plus the special
   * words "belajar" and "pelajar"). Rules are tried in order; the first match wins.
   */
  private int removeSecondOrderPrefix(char text[], int length) {
    if (startsWith(text, length, "ber")) {
      return stripPrefix(text, length, 3, REMOVED_BER);
    }

    if (length == 7 && startsWith(text, length, "belajar")) {
      return stripPrefix(text, length, 3, REMOVED_BER);
    }

    if (startsWith(text, length, "be") && length > 4
        && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') {
      return stripPrefix(text, length, 2, REMOVED_BER);
    }

    if (startsWith(text, length, "per")) {
      return stripPrefix(text, length, 3, 0); // "per" sets no flag
    }

    if (length == 7 && startsWith(text, length, "pelajar")) {
      return stripPrefix(text, length, 3, 0); // "pelajar" sets no flag
    }

    if (startsWith(text, length, "pe")) {
      return stripPrefix(text, length, 2, REMOVED_PE);
    }

    return length;
  }

  /**
   * Removes one derivational suffix ("kan", "an" or "i"), unless a prefix
   * recorded in <code>flags</code> forbids that combination.
   */
  private int removeSuffix(char text[], int length) {
    if (endsWith(text, length, "kan")
        && (flags & REMOVED_KE) == 0
        && (flags & REMOVED_PENG) == 0
        && (flags & REMOVED_PE) == 0) {
      numSyllables--;
      return length - 3;
    }

    if (endsWith(text, length, "an")
        && (flags & REMOVED_DI) == 0
        && (flags & REMOVED_MENG) == 0
        && (flags & REMOVED_TER) == 0) {
      numSyllables--;
      return length - 2;
    }

    if (endsWith(text, length, "i")
        && !endsWith(text, length, "si")
        && (flags & REMOVED_BER) == 0
        && (flags & REMOVED_KE) == 0
        && (flags & REMOVED_PENG) == 0) {
      numSyllables--;
      return length - 1;
    }
    return length;
  }

  /**
   * Shared bookkeeping for a prefix rule that fired: records <code>flag</code>
   * (0 for none), counts one syllable less and drops the first
   * <code>nChars</code> characters.
   */
  private int stripPrefix(char text[], int length, int nChars, int flag) {
    flags |= flag;
    numSyllables--;
    return deleteN(text, 0, length, nChars);
  }

  /** Returns true if the first <code>len</code> chars of <code>s</code> start with <code>prefix</code>. */
  private boolean startsWith(char s[], int len, String prefix) {
    final int prefixLen = prefix.length();
    if (prefixLen > len) {
      return false;
    }
    for (int i = 0; i < prefixLen; i++) {
      if (s[i] != prefix.charAt(i)) {
        return false;
      }
    }
    return true;
  }

  /** Returns true if the first <code>len</code> chars of <code>s</code> end with <code>suffix</code>. */
  private boolean endsWith(char s[], int len, String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len) {
      return false;
    }
    for (int i = suffixLen - 1; i >= 0; i--) {
      if (s[len -(suffixLen - i)] != suffix.charAt(i)) {
        return false;
      }
    }

    return true;
  }

  /**
   * Deletes <code>nChars</code> characters starting at <code>pos</code> by
   * shifting the tail of the term left in a single copy.
   *
   * @return the new length, <code>len - nChars</code>
   */
  private int deleteN(char s[], int pos, int len, int nChars) {
    // nothing to shift when the deleted range reaches the end of the term
    if (pos + nChars < len) {
      System.arraycopy(s, pos + nChars, s, pos, len - pos - nChars);
    }
    return len - nChars;
  }
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Indonesian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,359 @@
|
||||||
|
# from appendix D of: A Study of Stemming Effects on Information
|
||||||
|
# Retrieval in Bahasa Indonesia
|
||||||
|
ada
|
||||||
|
adanya
|
||||||
|
adalah
|
||||||
|
adapun
|
||||||
|
agak
|
||||||
|
agaknya
|
||||||
|
agar
|
||||||
|
akan
|
||||||
|
akankah
|
||||||
|
akhirnya
|
||||||
|
aku
|
||||||
|
akulah
|
||||||
|
amat
|
||||||
|
amatlah
|
||||||
|
anda
|
||||||
|
andalah
|
||||||
|
antar
|
||||||
|
diantaranya
|
||||||
|
antara
|
||||||
|
antaranya
|
||||||
|
diantara
|
||||||
|
apa
|
||||||
|
apaan
|
||||||
|
mengapa
|
||||||
|
apabila
|
||||||
|
apakah
|
||||||
|
apalagi
|
||||||
|
apatah
|
||||||
|
atau
|
||||||
|
ataukah
|
||||||
|
ataupun
|
||||||
|
bagai
|
||||||
|
bagaikan
|
||||||
|
sebagai
|
||||||
|
sebagainya
|
||||||
|
bagaimana
|
||||||
|
bagaimanapun
|
||||||
|
sebagaimana
|
||||||
|
bagaimanakah
|
||||||
|
bagi
|
||||||
|
bahkan
|
||||||
|
bahwa
|
||||||
|
bahwasanya
|
||||||
|
sebaliknya
|
||||||
|
banyak
|
||||||
|
sebanyak
|
||||||
|
beberapa
|
||||||
|
seberapa
|
||||||
|
begini
|
||||||
|
beginian
|
||||||
|
beginikah
|
||||||
|
beginilah
|
||||||
|
sebegini
|
||||||
|
begitu
|
||||||
|
begitukah
|
||||||
|
begitulah
|
||||||
|
begitupun
|
||||||
|
sebegitu
|
||||||
|
belum
|
||||||
|
belumlah
|
||||||
|
sebelum
|
||||||
|
sebelumnya
|
||||||
|
sebenarnya
|
||||||
|
berapa
|
||||||
|
berapakah
|
||||||
|
berapalah
|
||||||
|
berapapun
|
||||||
|
betulkah
|
||||||
|
sebetulnya
|
||||||
|
biasa
|
||||||
|
biasanya
|
||||||
|
bila
|
||||||
|
bilakah
|
||||||
|
bisa
|
||||||
|
bisakah
|
||||||
|
sebisanya
|
||||||
|
boleh
|
||||||
|
bolehkah
|
||||||
|
bolehlah
|
||||||
|
buat
|
||||||
|
bukan
|
||||||
|
bukankah
|
||||||
|
bukanlah
|
||||||
|
bukannya
|
||||||
|
cuma
|
||||||
|
percuma
|
||||||
|
dahulu
|
||||||
|
dalam
|
||||||
|
dan
|
||||||
|
dapat
|
||||||
|
dari
|
||||||
|
daripada
|
||||||
|
dekat
|
||||||
|
demi
|
||||||
|
demikian
|
||||||
|
demikianlah
|
||||||
|
sedemikian
|
||||||
|
dengan
|
||||||
|
depan
|
||||||
|
di
|
||||||
|
dia
|
||||||
|
dialah
|
||||||
|
dini
|
||||||
|
diri
|
||||||
|
dirinya
|
||||||
|
terdiri
|
||||||
|
dong
|
||||||
|
dulu
|
||||||
|
enggak
|
||||||
|
enggaknya
|
||||||
|
entah
|
||||||
|
entahlah
|
||||||
|
terhadap
|
||||||
|
terhadapnya
|
||||||
|
hal
|
||||||
|
hampir
|
||||||
|
hanya
|
||||||
|
hanyalah
|
||||||
|
harus
|
||||||
|
haruslah
|
||||||
|
harusnya
|
||||||
|
seharusnya
|
||||||
|
hendak
|
||||||
|
hendaklah
|
||||||
|
hendaknya
|
||||||
|
hingga
|
||||||
|
sehingga
|
||||||
|
ia
|
||||||
|
ialah
|
||||||
|
ibarat
|
||||||
|
ingin
|
||||||
|
inginkah
|
||||||
|
inginkan
|
||||||
|
ini
|
||||||
|
inikah
|
||||||
|
inilah
|
||||||
|
itu
|
||||||
|
itukah
|
||||||
|
itulah
|
||||||
|
jangan
|
||||||
|
jangankan
|
||||||
|
janganlah
|
||||||
|
jika
|
||||||
|
jikalau
|
||||||
|
juga
|
||||||
|
justru
|
||||||
|
kala
|
||||||
|
kalau
|
||||||
|
kalaulah
|
||||||
|
kalaupun
|
||||||
|
kalian
|
||||||
|
kami
|
||||||
|
kamilah
|
||||||
|
kamu
|
||||||
|
kamulah
|
||||||
|
kan
|
||||||
|
kapan
|
||||||
|
kapankah
|
||||||
|
kapanpun
|
||||||
|
dikarenakan
|
||||||
|
karena
|
||||||
|
karenanya
|
||||||
|
ke
|
||||||
|
kecil
|
||||||
|
kemudian
|
||||||
|
kenapa
|
||||||
|
kepada
|
||||||
|
kepadanya
|
||||||
|
ketika
|
||||||
|
seketika
|
||||||
|
khususnya
|
||||||
|
kini
|
||||||
|
kinilah
|
||||||
|
kiranya
|
||||||
|
sekiranya
|
||||||
|
kita
|
||||||
|
kitalah
|
||||||
|
kok
|
||||||
|
lagi
|
||||||
|
lagian
|
||||||
|
selagi
|
||||||
|
lah
|
||||||
|
lain
|
||||||
|
lainnya
|
||||||
|
melainkan
|
||||||
|
selaku
|
||||||
|
lalu
|
||||||
|
melalui
|
||||||
|
terlalu
|
||||||
|
lama
|
||||||
|
lamanya
|
||||||
|
selama
|
||||||
|
selama
|
||||||
|
selamanya
|
||||||
|
lebih
|
||||||
|
terlebih
|
||||||
|
bermacam
|
||||||
|
macam
|
||||||
|
semacam
|
||||||
|
maka
|
||||||
|
makanya
|
||||||
|
makin
|
||||||
|
malah
|
||||||
|
malahan
|
||||||
|
mampu
|
||||||
|
mampukah
|
||||||
|
mana
|
||||||
|
manakala
|
||||||
|
manalagi
|
||||||
|
masih
|
||||||
|
masihkah
|
||||||
|
semasih
|
||||||
|
masing
|
||||||
|
mau
|
||||||
|
maupun
|
||||||
|
semaunya
|
||||||
|
memang
|
||||||
|
mereka
|
||||||
|
merekalah
|
||||||
|
meski
|
||||||
|
meskipun
|
||||||
|
semula
|
||||||
|
mungkin
|
||||||
|
mungkinkah
|
||||||
|
nah
|
||||||
|
namun
|
||||||
|
nanti
|
||||||
|
nantinya
|
||||||
|
nyaris
|
||||||
|
oleh
|
||||||
|
olehnya
|
||||||
|
seorang
|
||||||
|
seseorang
|
||||||
|
pada
|
||||||
|
padanya
|
||||||
|
padahal
|
||||||
|
paling
|
||||||
|
sepanjang
|
||||||
|
pantas
|
||||||
|
sepantasnya
|
||||||
|
sepantasnyalah
|
||||||
|
para
|
||||||
|
pasti
|
||||||
|
pastilah
|
||||||
|
per
|
||||||
|
pernah
|
||||||
|
pula
|
||||||
|
pun
|
||||||
|
merupakan
|
||||||
|
rupanya
|
||||||
|
serupa
|
||||||
|
saat
|
||||||
|
saatnya
|
||||||
|
sesaat
|
||||||
|
saja
|
||||||
|
sajalah
|
||||||
|
saling
|
||||||
|
bersama
|
||||||
|
sama
|
||||||
|
sesama
|
||||||
|
sambil
|
||||||
|
sampai
|
||||||
|
sana
|
||||||
|
sangat
|
||||||
|
sangatlah
|
||||||
|
saya
|
||||||
|
sayalah
|
||||||
|
se
|
||||||
|
sebab
|
||||||
|
sebabnya
|
||||||
|
sebuah
|
||||||
|
tersebut
|
||||||
|
tersebutlah
|
||||||
|
sedang
|
||||||
|
sedangkan
|
||||||
|
sedikit
|
||||||
|
sedikitnya
|
||||||
|
segala
|
||||||
|
segalanya
|
||||||
|
segera
|
||||||
|
sesegera
|
||||||
|
sejak
|
||||||
|
sejenak
|
||||||
|
sekali
|
||||||
|
sekalian
|
||||||
|
sekalipun
|
||||||
|
sesekali
|
||||||
|
sekaligus
|
||||||
|
sekarang
|
||||||
|
sekarang
|
||||||
|
sekitar
|
||||||
|
sekitarnya
|
||||||
|
sela
|
||||||
|
selain
|
||||||
|
selalu
|
||||||
|
seluruh
|
||||||
|
seluruhnya
|
||||||
|
semakin
|
||||||
|
sementara
|
||||||
|
sempat
|
||||||
|
semua
|
||||||
|
semuanya
|
||||||
|
sendiri
|
||||||
|
sendirinya
|
||||||
|
seolah
|
||||||
|
seperti
|
||||||
|
sepertinya
|
||||||
|
sering
|
||||||
|
seringnya
|
||||||
|
serta
|
||||||
|
siapa
|
||||||
|
siapakah
|
||||||
|
siapapun
|
||||||
|
disini
|
||||||
|
disinilah
|
||||||
|
sini
|
||||||
|
sinilah
|
||||||
|
sesuatu
|
||||||
|
sesuatunya
|
||||||
|
suatu
|
||||||
|
sesudah
|
||||||
|
sesudahnya
|
||||||
|
sudah
|
||||||
|
sudahkah
|
||||||
|
sudahlah
|
||||||
|
supaya
|
||||||
|
tadi
|
||||||
|
tadinya
|
||||||
|
tak
|
||||||
|
tanpa
|
||||||
|
setelah
|
||||||
|
telah
|
||||||
|
tentang
|
||||||
|
tentu
|
||||||
|
tentulah
|
||||||
|
tentunya
|
||||||
|
tertentu
|
||||||
|
seterusnya
|
||||||
|
tapi
|
||||||
|
tetapi
|
||||||
|
setiap
|
||||||
|
tiap
|
||||||
|
setidaknya
|
||||||
|
tidak
|
||||||
|
tidakkah
|
||||||
|
tidaklah
|
||||||
|
toh
|
||||||
|
waduh
|
||||||
|
wah
|
||||||
|
wahai
|
||||||
|
sewaktu
|
||||||
|
walau
|
||||||
|
walaupun
|
||||||
|
wong
|
||||||
|
yaitu
|
||||||
|
yakni
|
||||||
|
yang
|
|
@ -0,0 +1,53 @@
|
||||||
|
package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
|
||||||
|
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new IndonesianAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "peledakan", "ledak");
|
||||||
|
checkOneTermReuse(a, "pembunuhan", "bunuh");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "bahwa", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("peledakan");
|
||||||
|
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
|
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "peledakan", "peledakan");
|
||||||
|
checkOneTermReuse(a, "pembunuhan", "bunuh");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,136 @@
|
||||||
|
package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link IndonesianStemmer}
|
||||||
|
*/
|
||||||
|
public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
|
||||||
|
/* full stemming, no stopwords */
|
||||||
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||||
|
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Some examples from the paper */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(a, "bukukah", "buku");
|
||||||
|
checkOneTermReuse(a, "adalah", "ada");
|
||||||
|
checkOneTermReuse(a, "bukupun", "buku");
|
||||||
|
checkOneTermReuse(a, "bukuku", "buku");
|
||||||
|
checkOneTermReuse(a, "bukumu", "buku");
|
||||||
|
checkOneTermReuse(a, "bukunya", "buku");
|
||||||
|
checkOneTermReuse(a, "mengukur", "ukur");
|
||||||
|
checkOneTermReuse(a, "menyapu", "sapu");
|
||||||
|
checkOneTermReuse(a, "menduga", "duga");
|
||||||
|
checkOneTermReuse(a, "menuduh", "uduh");
|
||||||
|
checkOneTermReuse(a, "membaca", "baca");
|
||||||
|
checkOneTermReuse(a, "merusak", "rusak");
|
||||||
|
checkOneTermReuse(a, "pengukur", "ukur");
|
||||||
|
checkOneTermReuse(a, "penyapu", "sapu");
|
||||||
|
checkOneTermReuse(a, "penduga", "duga");
|
||||||
|
checkOneTermReuse(a, "pembaca", "baca");
|
||||||
|
checkOneTermReuse(a, "diukur", "ukur");
|
||||||
|
checkOneTermReuse(a, "tersapu", "sapu");
|
||||||
|
checkOneTermReuse(a, "kekasih", "kasih");
|
||||||
|
checkOneTermReuse(a, "berlari", "lari");
|
||||||
|
checkOneTermReuse(a, "belajar", "ajar");
|
||||||
|
checkOneTermReuse(a, "bekerja", "kerja");
|
||||||
|
checkOneTermReuse(a, "perjelas", "jelas");
|
||||||
|
checkOneTermReuse(a, "pelajar", "ajar");
|
||||||
|
checkOneTermReuse(a, "pekerja", "kerja");
|
||||||
|
checkOneTermReuse(a, "tarikkan", "tarik");
|
||||||
|
checkOneTermReuse(a, "ambilkan", "ambil");
|
||||||
|
checkOneTermReuse(a, "mengambilkan", "ambil");
|
||||||
|
checkOneTermReuse(a, "makanan", "makan");
|
||||||
|
checkOneTermReuse(a, "janjian", "janji");
|
||||||
|
checkOneTermReuse(a, "perjanjian", "janji");
|
||||||
|
checkOneTermReuse(a, "tandai", "tanda");
|
||||||
|
checkOneTermReuse(a, "dapati", "dapat");
|
||||||
|
checkOneTermReuse(a, "mendapati", "dapat");
|
||||||
|
checkOneTermReuse(a, "pantai", "panta");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Some detailed analysis examples (that might not be the best) */
|
||||||
|
public void testIRExamples() throws IOException {
|
||||||
|
checkOneTerm(a, "penyalahgunaan", "salahguna");
|
||||||
|
checkOneTermReuse(a, "menyalahgunakan", "salahguna");
|
||||||
|
checkOneTermReuse(a, "disalahgunakan", "salahguna");
|
||||||
|
|
||||||
|
checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
|
||||||
|
checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
|
||||||
|
checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");
|
||||||
|
|
||||||
|
checkOneTermReuse(a, "pelaksanaan", "laksana");
|
||||||
|
checkOneTermReuse(a, "pelaksana", "laksana");
|
||||||
|
checkOneTermReuse(a, "melaksanakan", "laksana");
|
||||||
|
checkOneTermReuse(a, "dilaksanakan", "laksana");
|
||||||
|
|
||||||
|
checkOneTermReuse(a, "melibatkan", "libat");
|
||||||
|
checkOneTermReuse(a, "terlibat", "libat");
|
||||||
|
|
||||||
|
checkOneTermReuse(a, "penculikan", "culik");
|
||||||
|
checkOneTermReuse(a, "menculik", "culik");
|
||||||
|
checkOneTermReuse(a, "diculik", "culik");
|
||||||
|
checkOneTermReuse(a, "penculik", "culik");
|
||||||
|
|
||||||
|
checkOneTermReuse(a, "perubahan", "ubah");
|
||||||
|
checkOneTermReuse(a, "peledakan", "ledak");
|
||||||
|
checkOneTermReuse(a, "penanganan", "tangan");
|
||||||
|
checkOneTermReuse(a, "kepolisian", "polisi");
|
||||||
|
checkOneTermReuse(a, "kenaikan", "naik");
|
||||||
|
checkOneTermReuse(a, "bersenjata", "senjata");
|
||||||
|
checkOneTermReuse(a, "penyelewengan", "seleweng");
|
||||||
|
checkOneTermReuse(a, "kecelakaan", "celaka");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* inflectional-only stemming */
|
||||||
|
Analyzer b = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||||
|
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test stemming only inflectional suffixes */
|
||||||
|
public void testInflectionalOnly() throws IOException {
|
||||||
|
checkOneTerm(b, "bukunya", "buku");
|
||||||
|
checkOneTermReuse(b, "bukukah", "buku");
|
||||||
|
checkOneTermReuse(b, "bukunyakah", "buku");
|
||||||
|
checkOneTermReuse(b, "dibukukannya", "dibukukan");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testShouldntStem() throws IOException {
|
||||||
|
checkOneTerm(a, "bersenjata", "senjata");
|
||||||
|
checkOneTermReuse(a, "bukukah", "buku");
|
||||||
|
checkOneTermReuse(a, "gigi", "gigi");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,37 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.id.IndonesianStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link IndonesianStemFilter} */
|
||||||
|
public class IndonesianStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private boolean stemDerivational = true;
|
||||||
|
|
||||||
|
public void init(Map<String, String> args) {
|
||||||
|
super.init(args);
|
||||||
|
stemDerivational = getBoolean("stemDerivational", true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new IndonesianStemFilter(input, stemDerivational);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,59 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Indonesian stem filter factory is working.
|
||||||
|
*/
|
||||||
|
public class TestIndonesianStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
/**
|
||||||
|
* Ensure the filter actually stems text.
|
||||||
|
*/
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("dibukukannya");
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "buku" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test inflectional-only mode
|
||||||
|
*/
|
||||||
|
public void testStemmingInflectional() throws Exception {
|
||||||
|
Reader reader = new StringReader("dibukukannya");
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||||
|
IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("stemDerivational", "false");
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(tokenizer);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "dibukukan" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue