mirror of https://github.com/apache/lucene.git
LUCENE-2234: Hindi Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@906468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f8951f06c
commit
23d403b6bb
|
@ -28,6 +28,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
||||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
|
The Hindi analyzer (contrib/analyzers) comes with a default
|
||||||
|
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||||
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
|
||||||
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||||
|
|
||||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||||
|
|
|
@ -103,6 +103,8 @@ New features
|
||||||
character is now configurable. It's also up to 20% faster.
|
character is now configurable. It's also up to 20% faster.
|
||||||
(Steven Rowe via Robert Muir)
|
(Steven Rowe via Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -0,0 +1,132 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||||
|
import org.apache.lucene.analysis.in.IndicTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Hindi.
|
||||||
|
*/
|
||||||
|
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Hindi stopwords.
|
||||||
|
*
|
||||||
|
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||||
|
* The stopword list is BSD-Licensed.
|
||||||
|
*/
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param version lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a stemming exclusion set
|
||||||
|
*/
|
||||||
|
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(version, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
||||||
|
CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param version lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public HindiAnalyzer(Version version, Set<?> stopwords) {
|
||||||
|
this(version, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words:
|
||||||
|
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public HindiAnalyzer(Version version) {
|
||||||
|
this(version, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
|
||||||
|
* filtered with {@link LowerCaseFilter},
|
||||||
|
* {@link IndicNormalizationFilter},
|
||||||
|
* {@link HindiNormalizationFilter},
|
||||||
|
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||||
|
* {@link HindiStemFilter}, and Hindi Stop words
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new IndicTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||||
|
if (!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
result = new HindiNormalizationFilter(result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
result = new HindiStemFilter(result);
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,59 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
|
||||||
|
* orthography.
|
||||||
|
* <p>
|
||||||
|
* In some cases the normalization may cause unrelated terms to conflate, so
|
||||||
|
* to prevent terms from being normalized use an instance of
|
||||||
|
* {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
* @see HindiNormalizer
|
||||||
|
*/
|
||||||
|
public final class HindiNormalizationFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final HindiNormalizer normalizer = new HindiNormalizer();
|
||||||
|
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public HindiNormalizationFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAtt.isKeyword())
|
||||||
|
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(),
|
||||||
|
termAtt.termLength()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,194 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizer for Hindi.
|
||||||
|
* <p>
|
||||||
|
* Normalizes text to remove some differences in spelling variations.
|
||||||
|
* <p>
|
||||||
|
* Implements the Hindi-language specific algorithm specified in:
|
||||||
|
* <i>Word normalization in Indian languages</i>
|
||||||
|
* Prasad Pingali and Vasudeva Varma.
|
||||||
|
* http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
|
||||||
|
* <p>
|
||||||
|
* with the following additions from <i>Hindi CLIR in Thirty Days</i>
|
||||||
|
* Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
|
||||||
|
* http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
|
||||||
|
* <ul>
|
||||||
|
* <li>Internal Zero-width joiner and Zero-width non-joiners are removed
|
||||||
|
* <li>In addition to chandrabindu, NA+halant is normalized to anusvara
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class HindiNormalizer {
|
||||||
|
/**
|
||||||
|
* Normalize an input buffer of Hindi text
|
||||||
|
*
|
||||||
|
* @param s input buffer
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after normalization
|
||||||
|
*/
|
||||||
|
public int normalize(char s[], int len) {
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
switch (s[i]) {
|
||||||
|
// dead n -> bindu
|
||||||
|
case '\u0928':
|
||||||
|
if (i + 1 < len && s[i + 1] == '\u094D') {
|
||||||
|
s[i] = '\u0902';
|
||||||
|
len = delete(s, i + 1, len);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
// candrabindu -> bindu
|
||||||
|
case '\u0901':
|
||||||
|
s[i] = '\u0902';
|
||||||
|
break;
|
||||||
|
// nukta deletions
|
||||||
|
case '\u093C':
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i--;
|
||||||
|
break;
|
||||||
|
case '\u0929':
|
||||||
|
s[i] = '\u0928';
|
||||||
|
break;
|
||||||
|
case '\u0931':
|
||||||
|
s[i] = '\u0930';
|
||||||
|
break;
|
||||||
|
case '\u0934':
|
||||||
|
s[i] = '\u0933';
|
||||||
|
break;
|
||||||
|
case '\u0958':
|
||||||
|
s[i] = '\u0915';
|
||||||
|
break;
|
||||||
|
case '\u0959':
|
||||||
|
s[i] = '\u0916';
|
||||||
|
break;
|
||||||
|
case '\u095A':
|
||||||
|
s[i] = '\u0917';
|
||||||
|
break;
|
||||||
|
case '\u095B':
|
||||||
|
s[i] = '\u091C';
|
||||||
|
break;
|
||||||
|
case '\u095C':
|
||||||
|
s[i] = '\u0921';
|
||||||
|
break;
|
||||||
|
case '\u095D':
|
||||||
|
s[i] = '\u0922';
|
||||||
|
break;
|
||||||
|
case '\u095E':
|
||||||
|
s[i] = '\u092B';
|
||||||
|
break;
|
||||||
|
case '\u095F':
|
||||||
|
s[i] = '\u092F';
|
||||||
|
break;
|
||||||
|
// zwj/zwnj -> delete
|
||||||
|
case '\u200D':
|
||||||
|
case '\u200C':
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i--;
|
||||||
|
break;
|
||||||
|
// virama -> delete
|
||||||
|
case '\u094D':
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i--;
|
||||||
|
break;
|
||||||
|
// chandra/short -> replace
|
||||||
|
case '\u0945':
|
||||||
|
case '\u0946':
|
||||||
|
s[i] = '\u0947';
|
||||||
|
break;
|
||||||
|
case '\u0949':
|
||||||
|
case '\u094A':
|
||||||
|
s[i] = '\u094B';
|
||||||
|
break;
|
||||||
|
case '\u090D':
|
||||||
|
case '\u090E':
|
||||||
|
s[i] = '\u090F';
|
||||||
|
break;
|
||||||
|
case '\u0911':
|
||||||
|
case '\u0912':
|
||||||
|
s[i] = '\u0913';
|
||||||
|
break;
|
||||||
|
case '\u0972':
|
||||||
|
s[i] = '\u0905';
|
||||||
|
break;
|
||||||
|
// long -> short ind. vowels
|
||||||
|
case '\u0906':
|
||||||
|
s[i] = '\u0905';
|
||||||
|
break;
|
||||||
|
case '\u0908':
|
||||||
|
s[i] = '\u0907';
|
||||||
|
break;
|
||||||
|
case '\u090A':
|
||||||
|
s[i] = '\u0909';
|
||||||
|
break;
|
||||||
|
case '\u0960':
|
||||||
|
s[i] = '\u090B';
|
||||||
|
break;
|
||||||
|
case '\u0961':
|
||||||
|
s[i] = '\u090C';
|
||||||
|
break;
|
||||||
|
case '\u0910':
|
||||||
|
s[i] = '\u090F';
|
||||||
|
break;
|
||||||
|
case '\u0914':
|
||||||
|
s[i] = '\u0913';
|
||||||
|
break;
|
||||||
|
// long -> short dep. vowels
|
||||||
|
case '\u0940':
|
||||||
|
s[i] = '\u093F';
|
||||||
|
break;
|
||||||
|
case '\u0942':
|
||||||
|
s[i] = '\u0941';
|
||||||
|
break;
|
||||||
|
case '\u0944':
|
||||||
|
s[i] = '\u0943';
|
||||||
|
break;
|
||||||
|
case '\u0963':
|
||||||
|
s[i] = '\u0962';
|
||||||
|
break;
|
||||||
|
case '\u0948':
|
||||||
|
s[i] = '\u0947';
|
||||||
|
break;
|
||||||
|
case '\u094C':
|
||||||
|
s[i] = '\u094B';
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete a character in-place
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param pos Position of character to delete
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after deletion
|
||||||
|
*/
|
||||||
|
protected int delete(char s[], int pos, int len) {
|
||||||
|
if (pos < len)
|
||||||
|
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||||
|
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
|
||||||
|
*/
|
||||||
|
public final class HindiStemFilter extends TokenFilter {
|
||||||
|
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
|
private final HindiStemmer stemmer = new HindiStemmer();
|
||||||
|
|
||||||
|
protected HindiStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAtt.isKeyword())
|
||||||
|
termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,130 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Hindi.
|
||||||
|
* <p>
|
||||||
|
* Implements the algorithm specified in:
|
||||||
|
* <i>A Lightweight Stemmer for Hindi</i>
|
||||||
|
* Ananthakrishnan Ramanathan and Durgesh D Rao.
|
||||||
|
* http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class HindiStemmer {
|
||||||
|
public int stem(char buffer[], int len) {
|
||||||
|
// 5
|
||||||
|
if ((len > 6) && (endsWith(buffer, len, "ाएंगी")
|
||||||
|
|| endsWith(buffer, len, "ाएंगे")
|
||||||
|
|| endsWith(buffer, len, "ाऊंगी")
|
||||||
|
|| endsWith(buffer, len, "ाऊंगा")
|
||||||
|
|| endsWith(buffer, len, "ाइयाँ")
|
||||||
|
|| endsWith(buffer, len, "ाइयों")
|
||||||
|
|| endsWith(buffer, len, "ाइयां")
|
||||||
|
))
|
||||||
|
return len - 5;
|
||||||
|
|
||||||
|
// 4
|
||||||
|
if ((len > 5) && (endsWith(buffer, len, "ाएगी")
|
||||||
|
|| endsWith(buffer, len, "ाएगा")
|
||||||
|
|| endsWith(buffer, len, "ाओगी")
|
||||||
|
|| endsWith(buffer, len, "ाओगे")
|
||||||
|
|| endsWith(buffer, len, "एंगी")
|
||||||
|
|| endsWith(buffer, len, "ेंगी")
|
||||||
|
|| endsWith(buffer, len, "एंगे")
|
||||||
|
|| endsWith(buffer, len, "ेंगे")
|
||||||
|
|| endsWith(buffer, len, "ूंगी")
|
||||||
|
|| endsWith(buffer, len, "ूंगा")
|
||||||
|
|| endsWith(buffer, len, "ातीं")
|
||||||
|
|| endsWith(buffer, len, "नाओं")
|
||||||
|
|| endsWith(buffer, len, "नाएं")
|
||||||
|
|| endsWith(buffer, len, "ताओं")
|
||||||
|
|| endsWith(buffer, len, "ताएं")
|
||||||
|
|| endsWith(buffer, len, "ियाँ")
|
||||||
|
|| endsWith(buffer, len, "ियों")
|
||||||
|
|| endsWith(buffer, len, "ियां")
|
||||||
|
))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
// 3
|
||||||
|
if ((len > 4) && (endsWith(buffer, len, "ाकर")
|
||||||
|
|| endsWith(buffer, len, "ाइए")
|
||||||
|
|| endsWith(buffer, len, "ाईं")
|
||||||
|
|| endsWith(buffer, len, "ाया")
|
||||||
|
|| endsWith(buffer, len, "ेगी")
|
||||||
|
|| endsWith(buffer, len, "ेगा")
|
||||||
|
|| endsWith(buffer, len, "ोगी")
|
||||||
|
|| endsWith(buffer, len, "ोगे")
|
||||||
|
|| endsWith(buffer, len, "ाने")
|
||||||
|
|| endsWith(buffer, len, "ाना")
|
||||||
|
|| endsWith(buffer, len, "ाते")
|
||||||
|
|| endsWith(buffer, len, "ाती")
|
||||||
|
|| endsWith(buffer, len, "ाता")
|
||||||
|
|| endsWith(buffer, len, "तीं")
|
||||||
|
|| endsWith(buffer, len, "ाओं")
|
||||||
|
|| endsWith(buffer, len, "ाएं")
|
||||||
|
|| endsWith(buffer, len, "ुओं")
|
||||||
|
|| endsWith(buffer, len, "ुएं")
|
||||||
|
|| endsWith(buffer, len, "ुआं")
|
||||||
|
))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
// 2
|
||||||
|
if ((len > 3) && (endsWith(buffer, len, "कर")
|
||||||
|
|| endsWith(buffer, len, "ाओ")
|
||||||
|
|| endsWith(buffer, len, "िए")
|
||||||
|
|| endsWith(buffer, len, "ाई")
|
||||||
|
|| endsWith(buffer, len, "ाए")
|
||||||
|
|| endsWith(buffer, len, "ने")
|
||||||
|
|| endsWith(buffer, len, "नी")
|
||||||
|
|| endsWith(buffer, len, "ना")
|
||||||
|
|| endsWith(buffer, len, "ते")
|
||||||
|
|| endsWith(buffer, len, "ीं")
|
||||||
|
|| endsWith(buffer, len, "ती")
|
||||||
|
|| endsWith(buffer, len, "ता")
|
||||||
|
|| endsWith(buffer, len, "ाँ")
|
||||||
|
|| endsWith(buffer, len, "ां")
|
||||||
|
|| endsWith(buffer, len, "ों")
|
||||||
|
|| endsWith(buffer, len, "ें")
|
||||||
|
))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
// 1
|
||||||
|
if ((len > 2) && (endsWith(buffer, len, "ो")
|
||||||
|
|| endsWith(buffer, len, "े")
|
||||||
|
|| endsWith(buffer, len, "ू")
|
||||||
|
|| endsWith(buffer, len, "ु")
|
||||||
|
|| endsWith(buffer, len, "ी")
|
||||||
|
|| endsWith(buffer, len, "ि")
|
||||||
|
|| endsWith(buffer, len, "ा")
|
||||||
|
))
|
||||||
|
return len - 1;
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean endsWith(final char s[], final int len, final String suffix) {
|
||||||
|
final int suffixLen = suffix.length();
|
||||||
|
if (suffixLen > len)
|
||||||
|
return false;
|
||||||
|
for (int i = suffixLen - 1; i >= 0; i--)
|
||||||
|
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Hindi.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,47 @@
|
||||||
|
package org.apache.lucene.analysis.in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
|
||||||
|
* in Indian Languages.
|
||||||
|
*/
|
||||||
|
public final class IndicNormalizationFilter extends TokenFilter {
|
||||||
|
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||||
|
private final IndicNormalizer normalizer = new IndicNormalizer();
|
||||||
|
|
||||||
|
public IndicNormalizationFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()));
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,303 @@
|
||||||
|
package org.apache.lucene.analysis.in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
import java.util.IdentityHashMap;
|
||||||
|
import static java.lang.Character.UnicodeBlock.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizes the Unicode representation of text in Indian languages.
|
||||||
|
* <p>
|
||||||
|
* Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
|
||||||
|
* and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class IndicNormalizer {
  
  /**
   * Per-script data: a unique bit flag, the base codepoint of the script's
   * Unicode block, and a mask of block offsets that can start a decomposition.
   */
  private static class ScriptData {
    final int flag;       // power-of-two script id; matched against column 4 of decompositions
    final int base;       // first codepoint of the script's Unicode block (e.g. 0x0900 for Devanagari)
    BitSet decompMask;    // bit i set => block offset i begins at least one table entry (filled in below)
    
    ScriptData(int flag, int base) {
      this.flag = flag;
      this.base = base;
    }
  }

  // Maps each supported Unicode block to its ScriptData. An identity map is used;
  // NOTE(review): this relies on Character.UnicodeBlock.of returning shared
  // singleton instances — confirm against the Character.UnicodeBlock API.
  private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts = 
    new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
  
  /** Returns the bit flag registered for the given Unicode block. */
  private static int flag(Character.UnicodeBlock ub) {
    return scripts.get(ub).flag;
  }
  
  // NOTE: this block must precede the decompositions initializer below, because
  // the table rows call flag(); static initializers run in textual order.
  static {
    scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
    scripts.put(BENGALI, new ScriptData(2, 0x0980));
    scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
    scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
    scripts.put(ORIYA, new ScriptData(16, 0x0B00));
    scripts.put(TAMIL, new ScriptData(32, 0x0B80));
    scripts.put(TELUGU, new ScriptData(64, 0x0C00));
    scripts.put(KANNADA, new ScriptData(128, 0x0C80));
    scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
  }
  
  /**
   * Decompositions according to Unicode 5.2, 
   * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
   * 
   * Most of these are not handled by unicode normalization anyway.
   * 
   * The numbers here represent offsets into the respective codepages,
   * with -1 representing null and 0xFF representing zero-width joiner.
   * 
   * the columns are: ch1, ch2, ch3, res, flags
   * ch1, ch2, and ch3 are the decomposition
   * res is the composition, and flags are the scripts to which it applies.
   */
  private static final int decompositions[][] = {
    /* devanagari, gujarati vowel candra O */
    { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari short O */
    { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
    /* devanagari, gujarati letter O */
    { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari letter AI, gujarati letter AU */
    { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
    { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
    /* devanagari letter candra A */
    { 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
    /* gujarati vowel candra E */
    { 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
    /* devanagari letter short A */
    { 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
    /* gujarati letter E */
    { 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
    /* gurmukhi, gujarati letter AI */
    { 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
    /* devanagari, gujarati vowel candra O */
    { 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari short O */
    { 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
    /* devanagari, gujarati letter O */
    { 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
    { 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
    /* devanagari, gujarati vowel candra O */
    { 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari short O */
    { 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
    /* devanagari, gujarati letter O */
    { 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari letter AI, gujarati letter AU */
    { 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
    /* malayalam letter II */
    { 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
    /* devanagari letter UU */
    { 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
    /* tamil, malayalam letter UU (some styles) */
    { 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
    /* malayalam letter AI */
    { 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
    /* devanagari candra E */
    { 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
    /* devanagari short E */
    { 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
    /* devanagari AI */
    { 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
    /* oriya AI */
    { 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
    /* malayalam letter OO */
    { 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
    /* telugu, kannada letter AU */
    { 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
    /* telugu letter OO */
    { 0x12, 0x55, -1, 0x13, flag(TELUGU) },
    /* tamil, malayalam letter AU */
    { 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
    /* oriya letter AU */
    { 0x13, 0x57, -1, 0x14, flag(ORIYA) },
    /* devanagari qa */
    { 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
    /* devanagari, gurmukhi khha */
    { 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
    /* devanagari, gurmukhi ghha */
    { 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
    /* devanagari, gurmukhi za */
    { 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
    /* devanagari dddha, bengali, oriya rra */
    { 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
    /* devanagari, bengali, oriya rha */
    { 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
    /* malayalam chillu nn */
    { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
    /* bengali khanda ta */
    { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
    /* devanagari nnna */
    { 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
    /* malayalam chillu n */
    { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
    /* devanagari, gurmukhi fa */
    { 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
    /* devanagari, bengali yya */
    { 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
    /* telugu letter vocalic R */
    { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
    /* devanagari rra */
    { 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
    /* malayalam chillu rr */
    { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
    /* malayalam chillu l */
    { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
    /* devanagari llla */
    { 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
    /* malayalam chillu ll */
    { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
    /* telugu letter MA */
    { 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
    /* devanagari, gujarati vowel sign candra O */
    { 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari vowel sign short O */
    { 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
    /* devanagari, gujarati vowel sign O */
    { 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
    /* devanagari, gujarati vowel sign AU */
    { 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
    /* kannada vowel sign II */
    { 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
    /* gurmukhi vowel sign UU (when stacking) */
    { 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
    /* tamil, malayalam vowel sign O */
    { 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
    /* kannada vowel sign OO */
    { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
    /* kannada vowel sign O */
    { 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
    /* malayalam vowel sign AI (if reordered twice) */
    { 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
    /* telugu, kannada vowel sign EE */
    { 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
    /* telugu, kannada vowel sign AI */
    { 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
    /* tamil, malayalam vowel sign AU */
    { 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
    /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
    { 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
    /* bengali, oriya vowel sign AU */
    { 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
    /* kannada vowel sign OO */
    { 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
    /* gurmukhi letter I */
    { 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
    /* gurmukhi letter II */
    { 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
    /* gurmukhi letter EE */
    { 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
    /* gurmukhi letter U */
    { 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
    /* gurmukhi letter UU */
    { 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
    /* gurmukhi letter OO */
    { 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
  };
  
  // Precompute, for each script, which block offsets can begin a table entry,
  // so normalize() can cheaply skip characters that can never match.
  static {
    for (ScriptData sd : scripts.values()) {
      sd.decompMask = new BitSet(0x7F);
      for (int i = 0; i < decompositions.length; i++) {
        final int ch = decompositions[i][0];
        final int flags = decompositions[i][4];
        if ((flags & sd.flag) != 0)
          sd.decompMask.set(ch);
      }
    }
  }
  
  /**
   * Normalizes input text, and returns the new length.
   * The length will always be less than or equal to the existing length.
   * 
   * @param text input text
   * @param len valid length
   * @return normalized length
   */
  public int normalize(char text[], int len) {
    for (int i = 0; i < len; i++) {
      final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
      final ScriptData sd = scripts.get(block);
      if (sd != null) {
        // offset of this char within its script's block
        final int ch = text[i] - sd.base;
        // only attempt composition if this offset can start some table entry
        if (sd.decompMask.get(ch))
          len = compose(ch, block, sd, text, i, len);
      }
    }
    return len;
  }
  
  /**
   * Compose into standard form any compositions in the decompositions table.
   */
  private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd, 
    char text[], int pos, int len) {
    if (pos + 1 >= len) /* need at least 2 chars! */
      return len;
    
    final int ch1 = text[pos + 1] - sd.base;
    final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
    if (block1 != block0) /* needs to be the same writing system */
      return len;
    
    int ch2 = -1;
    
    if (pos + 2 < len) {
      ch2 = text[pos + 2] - sd.base;
      Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
      if (text[pos + 2] == '\u200D') // ZWJ
        ch2 = 0xFF;
      else if (block2 != block1) // still allow a 2-char match
        ch2 = -1;
    }

    // linear scan of the table for an entry starting with ch0 in this script
    for (int i = 0; i < decompositions.length; i++)
      if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
        if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
          // replace the first char with the composed form, then remove the rest
          text[pos] = (char) (sd.base + decompositions[i][3]);
          len = delete(text, pos + 1, len);
          if (decompositions[i][2] >= 0) // 3-char entry: remove the third char too
            len = delete(text, pos + 1, len);
          return len;
        }
      }
    
    return len;
  }
  
  /**
   * Delete a character in-place by shifting the tail left; returns the new length.
   */
  private int delete(char s[], int pos, int len) {
    if (pos < len) 
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
    
    return len - 1;
  }
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
package org.apache.lucene.analysis.in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharTokenizer;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple Tokenizer for text in Indian Languages.
|
||||||
|
*/
|
||||||
|
public final class IndicTokenizer extends CharTokenizer {
|
||||||
|
|
||||||
|
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
|
||||||
|
super(matchVersion, factory, input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
|
||||||
|
super(matchVersion, source, input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public IndicTokenizer(Version matchVersion, Reader input) {
|
||||||
|
super(matchVersion, input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean isTokenChar(int c) {
|
||||||
|
return Character.isLetter(c)
|
||||||
|
|| Character.getType(c) == Character.NON_SPACING_MARK
|
||||||
|
|| Character.getType(c) == Character.FORMAT
|
||||||
|
|| Character.getType(c) == Character.COMBINING_SPACING_MARK;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analysis components for Indian languages.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,231 @@
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
अंदर
|
||||||
|
अत
|
||||||
|
अपना
|
||||||
|
अपनी
|
||||||
|
अपने
|
||||||
|
अभी
|
||||||
|
आदि
|
||||||
|
आप
|
||||||
|
इत्यादि
|
||||||
|
इन
|
||||||
|
इनका
|
||||||
|
इन्हीं
|
||||||
|
इन्हें
|
||||||
|
इन्हों
|
||||||
|
इस
|
||||||
|
इसका
|
||||||
|
इसकी
|
||||||
|
इसके
|
||||||
|
इसमें
|
||||||
|
इसी
|
||||||
|
इसे
|
||||||
|
उन
|
||||||
|
उनका
|
||||||
|
उनकी
|
||||||
|
उनके
|
||||||
|
उनको
|
||||||
|
उन्हीं
|
||||||
|
उन्हें
|
||||||
|
उन्हों
|
||||||
|
उस
|
||||||
|
उसके
|
||||||
|
उसी
|
||||||
|
उसे
|
||||||
|
एक
|
||||||
|
एवं
|
||||||
|
एस
|
||||||
|
ऐसे
|
||||||
|
और
|
||||||
|
कई
|
||||||
|
कर
|
||||||
|
करता
|
||||||
|
करते
|
||||||
|
करना
|
||||||
|
करने
|
||||||
|
करें
|
||||||
|
कहते
|
||||||
|
कहा
|
||||||
|
का
|
||||||
|
काफ़ी
|
||||||
|
कि
|
||||||
|
कितना
|
||||||
|
किन्हें
|
||||||
|
किन्हों
|
||||||
|
किया
|
||||||
|
किर
|
||||||
|
किस
|
||||||
|
किसी
|
||||||
|
किसे
|
||||||
|
की
|
||||||
|
कुछ
|
||||||
|
कुल
|
||||||
|
के
|
||||||
|
को
|
||||||
|
कोई
|
||||||
|
कौन
|
||||||
|
कौनसा
|
||||||
|
गया
|
||||||
|
घर
|
||||||
|
जब
|
||||||
|
जहाँ
|
||||||
|
जा
|
||||||
|
जितना
|
||||||
|
जिन
|
||||||
|
जिन्हें
|
||||||
|
जिन्हों
|
||||||
|
जिस
|
||||||
|
जिसे
|
||||||
|
जीधर
|
||||||
|
जैसा
|
||||||
|
जैसे
|
||||||
|
जो
|
||||||
|
तक
|
||||||
|
तब
|
||||||
|
तरह
|
||||||
|
तिन
|
||||||
|
तिन्हें
|
||||||
|
तिन्हों
|
||||||
|
तिस
|
||||||
|
तिसे
|
||||||
|
तो
|
||||||
|
था
|
||||||
|
थी
|
||||||
|
थे
|
||||||
|
दबारा
|
||||||
|
दिया
|
||||||
|
दुसरा
|
||||||
|
दूसरे
|
||||||
|
दो
|
||||||
|
द्वारा
|
||||||
|
न
|
||||||
|
नहीं
|
||||||
|
ना
|
||||||
|
निहायत
|
||||||
|
नीचे
|
||||||
|
ने
|
||||||
|
पर
|
||||||
|
पर
|
||||||
|
पहले
|
||||||
|
पूरा
|
||||||
|
पे
|
||||||
|
फिर
|
||||||
|
बनी
|
||||||
|
बही
|
||||||
|
बहुत
|
||||||
|
बाद
|
||||||
|
बाला
|
||||||
|
बिलकुल
|
||||||
|
भी
|
||||||
|
भीतर
|
||||||
|
मगर
|
||||||
|
मानो
|
||||||
|
मे
|
||||||
|
में
|
||||||
|
यदि
|
||||||
|
यह
|
||||||
|
यहाँ
|
||||||
|
यही
|
||||||
|
या
|
||||||
|
यिह
|
||||||
|
ये
|
||||||
|
रखें
|
||||||
|
रहा
|
||||||
|
रहे
|
||||||
|
ऱ्वासा
|
||||||
|
लिए
|
||||||
|
लिये
|
||||||
|
लेकिन
|
||||||
|
व
|
||||||
|
वर्ग
|
||||||
|
वह
|
||||||
|
वह
|
||||||
|
वहाँ
|
||||||
|
वहीं
|
||||||
|
वाले
|
||||||
|
वुह
|
||||||
|
वे
|
||||||
|
वग़ैरह
|
||||||
|
संग
|
||||||
|
सकता
|
||||||
|
सकते
|
||||||
|
सबसे
|
||||||
|
सभी
|
||||||
|
साथ
|
||||||
|
साबुत
|
||||||
|
साभ
|
||||||
|
सारा
|
||||||
|
से
|
||||||
|
सो
|
||||||
|
ही
|
||||||
|
हुआ
|
||||||
|
हुई
|
||||||
|
हुए
|
||||||
|
है
|
||||||
|
हैं
|
||||||
|
हो
|
||||||
|
होता
|
||||||
|
होती
|
||||||
|
होते
|
||||||
|
होना
|
||||||
|
होने
|
||||||
|
# additional normalized forms of the above
|
||||||
|
अपनि
|
||||||
|
जेसे
|
||||||
|
होति
|
||||||
|
सभि
|
||||||
|
तिंहों
|
||||||
|
इंहों
|
||||||
|
दवारा
|
||||||
|
इसि
|
||||||
|
किंहें
|
||||||
|
थि
|
||||||
|
उंहों
|
||||||
|
ओर
|
||||||
|
जिंहें
|
||||||
|
वहिं
|
||||||
|
अभि
|
||||||
|
बनि
|
||||||
|
हि
|
||||||
|
उंहिं
|
||||||
|
उंहें
|
||||||
|
हें
|
||||||
|
वगेरह
|
||||||
|
एसे
|
||||||
|
रवासा
|
||||||
|
कोन
|
||||||
|
निचे
|
||||||
|
काफि
|
||||||
|
उसि
|
||||||
|
पुरा
|
||||||
|
भितर
|
||||||
|
हे
|
||||||
|
बहि
|
||||||
|
वहां
|
||||||
|
कोइ
|
||||||
|
यहां
|
||||||
|
जिंहों
|
||||||
|
तिंहें
|
||||||
|
किसि
|
||||||
|
कइ
|
||||||
|
यहि
|
||||||
|
इंहिं
|
||||||
|
जिधर
|
||||||
|
इंहें
|
||||||
|
अदि
|
||||||
|
इतयादि
|
||||||
|
हुइ
|
||||||
|
कोनसा
|
||||||
|
इसकि
|
||||||
|
दुसरे
|
||||||
|
जहां
|
||||||
|
अप
|
||||||
|
किंहों
|
||||||
|
उनकि
|
||||||
|
भि
|
||||||
|
वरग
|
||||||
|
हुअ
|
||||||
|
जेसा
|
||||||
|
नहिं
|
|
@ -0,0 +1,51 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the HindiAnalyzer
|
||||||
|
*/
|
||||||
|
public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// two ways to write 'hindi' itself.
|
||||||
|
checkOneTermReuse(a, "हिन्दी", "हिंद");
|
||||||
|
checkOneTermReuse(a, "हिंदी", "हिंद");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExclusionSet() throws Exception {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("हिंदी");
|
||||||
|
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test HindiNormalizer
|
||||||
|
*/
|
||||||
|
public class TestHindiNormalizer extends BaseTokenStreamTestCase {
|
||||||
|
/**
|
||||||
|
* Test some basic normalization, with an example from the paper.
|
||||||
|
*/
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
check("अँगरेज़ी", "अंगरेजि");
|
||||||
|
check("अँगरेजी", "अंगरेजि");
|
||||||
|
check("अँग्रेज़ी", "अंगरेजि");
|
||||||
|
check("अँग्रेजी", "अंगरेजि");
|
||||||
|
check("अंगरेज़ी", "अंगरेजि");
|
||||||
|
check("अंगरेजी", "अंगरेजि");
|
||||||
|
check("अंग्रेज़ी", "अंगरेजि");
|
||||||
|
check("अंग्रेजी", "अंगरेजि");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDecompositions() throws IOException {
|
||||||
|
// removing nukta dot
|
||||||
|
check("क़िताब", "किताब");
|
||||||
|
check("फ़र्ज़", "फरज");
|
||||||
|
check("क़र्ज़", "करज");
|
||||||
|
// some other composed nukta forms
|
||||||
|
check("ऱऴख़ग़ड़ढ़य़", "रळखगडढय");
|
||||||
|
// removal of format (ZWJ/ZWNJ)
|
||||||
|
check("शार्मा", "शारमा");
|
||||||
|
check("शार्मा", "शारमा");
|
||||||
|
// removal of chandra
|
||||||
|
check("ॅॆॉॊऍऎऑऒ\u0972", "ेेोोएएओओअ");
|
||||||
|
// vowel shortening
|
||||||
|
check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
|
||||||
|
}
|
||||||
|
private void check(String input, String output) throws IOException {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader(input));
|
||||||
|
TokenFilter tf = new HindiNormalizationFilter(tokenizer);
|
||||||
|
assertTokenStreamContents(tf, new String[] { output });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test HindiStemmer
|
||||||
|
*/
|
||||||
|
public class TestHindiStemmer extends BaseTokenStreamTestCase {
|
||||||
|
/**
|
||||||
|
* Test masc noun inflections
|
||||||
|
*/
|
||||||
|
public void testMasculineNouns() throws IOException {
|
||||||
|
check("लडका", "लडक");
|
||||||
|
check("लडके", "लडक");
|
||||||
|
check("लडकों", "लडक");
|
||||||
|
|
||||||
|
check("गुरु", "गुर");
|
||||||
|
check("गुरुओं", "गुर");
|
||||||
|
|
||||||
|
check("दोस्त", "दोस्त");
|
||||||
|
check("दोस्तों", "दोस्त");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test feminine noun inflections
|
||||||
|
*/
|
||||||
|
public void testFeminineNouns() throws IOException {
|
||||||
|
check("लडकी", "लडक");
|
||||||
|
check("लडकियों", "लडक");
|
||||||
|
|
||||||
|
check("किताब", "किताब");
|
||||||
|
check("किताबें", "किताब");
|
||||||
|
check("किताबों", "किताब");
|
||||||
|
|
||||||
|
check("आध्यापीका", "आध्यापीक");
|
||||||
|
check("आध्यापीकाएं", "आध्यापीक");
|
||||||
|
check("आध्यापीकाओं", "आध्यापीक");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test some verb forms
|
||||||
|
*/
|
||||||
|
public void testVerbs() throws IOException {
|
||||||
|
check("खाना", "खा");
|
||||||
|
check("खाता", "खा");
|
||||||
|
check("खाती", "खा");
|
||||||
|
check("खा", "खा");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* From the paper: since the suffix list for verbs includes AI, awA and anI,
|
||||||
|
* additional suffixes had to be added to the list for noun/adjectives
|
||||||
|
* ending with these endings.
|
||||||
|
*/
|
||||||
|
public void testExceptions() throws IOException {
|
||||||
|
check("कठिनाइयां", "कठिन");
|
||||||
|
check("कठिन", "कठिन");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(String input, String output) throws IOException {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader(input));
|
||||||
|
TokenFilter tf = new HindiStemFilter(tokenizer);
|
||||||
|
assertTokenStreamContents(tf, new String[] { output });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
package org.apache.lucene.analysis.in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test IndicNormalizer
|
||||||
|
*/
|
||||||
|
public class TestIndicNormalizer extends BaseTokenStreamTestCase {
|
||||||
|
/**
|
||||||
|
* Test some basic normalization
|
||||||
|
*/
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
check("अाॅअाॅ", "ऑऑ");
|
||||||
|
check("अाॆअाॆ", "ऒऒ");
|
||||||
|
check("अाेअाे", "ओओ");
|
||||||
|
check("अाैअाै", "औऔ");
|
||||||
|
check("अाअा", "आआ");
|
||||||
|
check("अाैर", "और");
|
||||||
|
// khanda-ta
|
||||||
|
check("ত্", "ৎ");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(String input, String output) throws IOException {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader(input));
|
||||||
|
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
||||||
|
assertTokenStreamContents(tf, new String[] { output });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,45 @@
|
||||||
|
package org.apache.lucene.analysis.in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test IndicTokenizer
|
||||||
|
*/
|
||||||
|
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
/** Test tokenizing Indic vowels, signs, and punctuation */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test that words with format chars such as ZWJ are kept */
|
||||||
|
public void testFormat() throws Exception {
|
||||||
|
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader("शार्मा शार्मा"));
|
||||||
|
assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue