LUCENE-2234: Hindi Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@906468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-02-04 12:41:56 +00:00
parent 1f8951f06c
commit 23d403b6bb
18 changed files with 1553 additions and 0 deletions

View File

@ -28,6 +28,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The Hindi analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
Includes lib/servlet-api-2.4.jar from Apache Tomcat Includes lib/servlet-api-2.4.jar from Apache Tomcat
The SmartChineseAnalyzer source code (under contrib/analyzers) was The SmartChineseAnalyzer source code (under contrib/analyzers) was

View File

@ -103,6 +103,8 @@ New features
character is now configurable. Its also up to 20% faster. character is now configurable. Its also up to 20% faster.
(Steven Rowe via Robert Muir) (Steven Rowe via Robert Muir)
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
Build Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -0,0 +1,132 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.in.IndicTokenizer;
import org.apache.lucene.util.Version;
/**
* Analyzer for Hindi.
*/
public final class HindiAnalyzer extends StopwordAnalyzerBase {
  // Terms in this set are marked as keywords and bypass stemming.
  private final Set<?> stemExclusionSet;

  /**
   * File containing default Hindi stopwords.
   *
   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
   * The stopword list is BSD-Licensed.
   */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /** Lines beginning with this marker in the stopword file are comments. */
  private static final String STOPWORDS_COMMENT = "#";

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR); chain the cause so a packaging problem is diagnosable
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the given stop words
   *
   * @param version lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a stemming exclusion set
   */
  public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(version, stopwords);
    // defensive, unmodifiable copy: callers cannot mutate the exclusion set later
    this.stemExclusionSet = CharArraySet.unmodifiableSet(
        CharArraySet.copy(matchVersion, stemExclusionSet));
  }

  /**
   * Builds an analyzer with the given stop words
   *
   * @param version lucene compatibility version
   * @param stopwords a stopword set
   */
  public HindiAnalyzer(Version version, Set<?> stopwords) {
    this(version, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the default stop words:
   * {@link #DEFAULT_STOPWORD_FILE}.
   */
  public HindiAnalyzer(Version version) {
    this(version, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
   * {@link Reader}.
   *
   * @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
   *         filtered with {@link LowerCaseFilter},
   *         {@link IndicNormalizationFilter},
   *         {@link HindiNormalizationFilter},
   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
   *         {@link HindiStemFilter}, and Hindi Stop words
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new IndicTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    // mark exclusions before normalization/stemming filters can alter the terms
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new IndicNormalizationFilter(result);
    result = new HindiNormalizationFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new HindiStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
* orthography.
* <p>
* In some cases the normalization may cause unrelated terms to conflate, so
* to prevent terms from being normalized use an instance of
* {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see HindiNormalizer
*/
public final class HindiNormalizationFilter extends TokenFilter {

  private final HindiNormalizer normalizer = new HindiNormalizer();
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  /** Wraps the given stream with Hindi orthographic normalization. */
  public HindiNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken())
      return false;
    // terms flagged as keywords pass through untouched
    if (!keywordAtt.isKeyword()) {
      final int normalizedLen =
          normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
      termAtt.setTermLength(normalizedLen);
    }
    return true;
  }
}

View File

@ -0,0 +1,194 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Normalizer for Hindi.
* <p>
* Normalizes text to remove some differences in spelling variations.
* <p>
* Implements the Hindi-language specific algorithm specified in:
* <i>Word normalization in Indian languages</i>
* Prasad Pingali and Vasudeva Varma.
* http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
* <p>
* with the following additions from <i>Hindi CLIR in Thirty Days</i>
* Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
* http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
* <ul>
* <li>Internal Zero-width joiner and Zero-width non-joiners are removed
* <li>In addition to chandrabindu, NA+halant is normalized to anusvara
* </ul>
*
*/
public class HindiNormalizer {
/**
* Normalize an input buffer of Hindi text
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// dead n -> bindu
case '\u0928':
if (i + 1 < len && s[i + 1] == '\u094D') {
s[i] = '\u0902';
len = delete(s, i + 1, len);
}
break;
// candrabindu -> bindu
case '\u0901':
s[i] = '\u0902';
break;
// nukta deletions
case '\u093C':
len = delete(s, i, len);
i--;
break;
case '\u0929':
s[i] = '\u0928';
break;
case '\u0931':
s[i] = '\u0930';
break;
case '\u0934':
s[i] = '\u0933';
break;
case '\u0958':
s[i] = '\u0915';
break;
case '\u0959':
s[i] = '\u0916';
break;
case '\u095A':
s[i] = '\u0917';
break;
case '\u095B':
s[i] = '\u091C';
break;
case '\u095C':
s[i] = '\u0921';
break;
case '\u095D':
s[i] = '\u0922';
break;
case '\u095E':
s[i] = '\u092B';
break;
case '\u095F':
s[i] = '\u092F';
break;
// zwj/zwnj -> delete
case '\u200D':
case '\u200C':
len = delete(s, i, len);
i--;
break;
// virama -> delete
case '\u094D':
len = delete(s, i, len);
i--;
break;
// chandra/short -> replace
case '\u0945':
case '\u0946':
s[i] = '\u0947';
break;
case '\u0949':
case '\u094A':
s[i] = '\u094B';
break;
case '\u090D':
case '\u090E':
s[i] = '\u090F';
break;
case '\u0911':
case '\u0912':
s[i] = '\u0913';
break;
case '\u0972':
s[i] = '\u0905';
break;
// long -> short ind. vowels
case '\u0906':
s[i] = '\u0905';
break;
case '\u0908':
s[i] = '\u0907';
break;
case '\u090A':
s[i] = '\u0909';
break;
case '\u0960':
s[i] = '\u090B';
break;
case '\u0961':
s[i] = '\u090C';
break;
case '\u0910':
s[i] = '\u090F';
break;
case '\u0914':
s[i] = '\u0913';
break;
// long -> short dep. vowels
case '\u0940':
s[i] = '\u093F';
break;
case '\u0942':
s[i] = '\u0941';
break;
case '\u0944':
s[i] = '\u0943';
break;
case '\u0963':
s[i] = '\u0962';
break;
case '\u0948':
s[i] = '\u0947';
break;
case '\u094C':
s[i] = '\u094B';
break;
default:
break;
}
}
return len;
}
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
*/
public final class HindiStemFilter extends TokenFilter {
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final HindiStemmer stemmer = new HindiStemmer();

  /** Wraps the given stream with Hindi light stemming. */
  protected HindiStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken())
      return false;
    // terms flagged as keywords are excluded from stemming
    if (!keywordAtt.isKeyword()) {
      final int stemmedLen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
      termAtt.setTermLength(stemmedLen);
    }
    return true;
  }
}

View File

@ -0,0 +1,130 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Light Stemmer for Hindi.
* <p>
* Implements the algorithm specified in:
* <i>A Lightweight Stemmer for Hindi</i>
* Ananthakrishnan Ramanathan and Durgesh D Rao.
* http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
* </p>
*/
public class HindiStemmer {
// Light suffix stripping: suffix classes are tried longest-first (5 chars
// down to 1) and the method returns at the first match. Each (len > k+1)
// guard keeps at least one character of the stem.
public int stem(char buffer[], int len) {
// 5
if ((len > 6) && (endsWith(buffer, len, "ाएंगी")
|| endsWith(buffer, len, "ाएंगे")
|| endsWith(buffer, len, "ाऊंगी")
|| endsWith(buffer, len, "ाऊंगा")
|| endsWith(buffer, len, "ाइयाँ")
|| endsWith(buffer, len, "ाइयों")
|| endsWith(buffer, len, "ाइयां")
))
return len - 5;
// 4
if ((len > 5) && (endsWith(buffer, len, "ाएगी")
|| endsWith(buffer, len, "ाएगा")
|| endsWith(buffer, len, "ाओगी")
|| endsWith(buffer, len, "ाओगे")
|| endsWith(buffer, len, "एंगी")
|| endsWith(buffer, len, "ेंगी")
|| endsWith(buffer, len, "एंगे")
|| endsWith(buffer, len, "ेंगे")
|| endsWith(buffer, len, "ूंगी")
|| endsWith(buffer, len, "ूंगा")
|| endsWith(buffer, len, "ातीं")
|| endsWith(buffer, len, "नाओं")
|| endsWith(buffer, len, "नाएं")
|| endsWith(buffer, len, "ताओं")
|| endsWith(buffer, len, "ताएं")
|| endsWith(buffer, len, "ियाँ")
|| endsWith(buffer, len, "ियों")
|| endsWith(buffer, len, "ियां")
))
return len - 4;
// 3
if ((len > 4) && (endsWith(buffer, len, "ाकर")
|| endsWith(buffer, len, "ाइए")
|| endsWith(buffer, len, "ाईं")
|| endsWith(buffer, len, "ाया")
|| endsWith(buffer, len, "ेगी")
|| endsWith(buffer, len, "ेगा")
|| endsWith(buffer, len, "ोगी")
|| endsWith(buffer, len, "ोगे")
|| endsWith(buffer, len, "ाने")
|| endsWith(buffer, len, "ाना")
|| endsWith(buffer, len, "ाते")
|| endsWith(buffer, len, "ाती")
|| endsWith(buffer, len, "ाता")
|| endsWith(buffer, len, "तीं")
|| endsWith(buffer, len, "ाओं")
|| endsWith(buffer, len, "ाएं")
|| endsWith(buffer, len, "ुओं")
|| endsWith(buffer, len, "ुएं")
|| endsWith(buffer, len, "ुआं")
))
return len - 3;
// 2
if ((len > 3) && (endsWith(buffer, len, "कर")
|| endsWith(buffer, len, "ाओ")
|| endsWith(buffer, len, "िए")
|| endsWith(buffer, len, "ाई")
|| endsWith(buffer, len, "ाए")
|| endsWith(buffer, len, "ने")
|| endsWith(buffer, len, "नी")
|| endsWith(buffer, len, "ना")
|| endsWith(buffer, len, "ते")
|| endsWith(buffer, len, "ीं")
|| endsWith(buffer, len, "ती")
|| endsWith(buffer, len, "ता")
|| endsWith(buffer, len, "ाँ")
|| endsWith(buffer, len, "ां")
|| endsWith(buffer, len, "ों")
|| endsWith(buffer, len, "ें")
))
return len - 2;
// 1
// NOTE(review): several of the single-character suffix literals below render
// as empty strings. endsWith(..., "") always returns true, so if any literal
// really is empty, every term with len > 2 is shortened by one character.
// These look like combining vowel signs (matras) lost in transcoding --
// verify the literals against the published algorithm / upstream source.
if ((len > 2) && (endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "ि")
|| endsWith(buffer, len, "")
))
return len - 1;
// no suffix matched: term unchanged
return len;
}
// Compares the suffix against the tail of the raw char buffer without
// allocating a String; walks backwards and bails on the first mismatch.
private boolean endsWith(final char s[], final int len, final String suffix) {
final int suffixLen = suffix.length();
if (suffixLen > len)
return false;
for (int i = suffixLen - 1; i >= 0; i--)
if (s[len -(suffixLen - i)] != suffix.charAt(i))
return false;
return true;
}
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Hindi.
</body>
</html>

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
* in Indian Languages.
*/
public final class IndicNormalizationFilter extends TokenFilter {
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final IndicNormalizer normalizer = new IndicNormalizer();

  /** Wraps the given stream with Indic script normalization. */
  public IndicNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken())
      return false;
    // every token is normalized; this filter has no keyword bypass
    final int normalizedLen =
        normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
    termAtt.setTermLength(normalizedLen);
    return true;
  }
}

View File

@ -0,0 +1,303 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.BitSet;
import java.util.IdentityHashMap;
import static java.lang.Character.UnicodeBlock.*;
/**
* Normalizes the Unicode representation of text in Indian languages.
* <p>
* Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
* and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
* </p>
*/
public class IndicNormalizer {
// Per-script metadata: a power-of-two bit flag used by the decomposition
// table, the starting codepoint of the script's Unicode block, and (filled
// in by the second static block) the set of table-relative first characters
// that can begin a decomposition in this script.
private static class ScriptData {
final int flag;
final int base;
BitSet decompMask;
ScriptData(int flag, int base) {
this.flag = flag;
this.base = base;
}
}
// Character.UnicodeBlock instances are canonical singletons, so an
// IdentityHashMap keyed on them is safe and fast.
private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
// Convenience lookup of a script's bit flag, used only while building the
// decompositions table below.
private static int flag(Character.UnicodeBlock ub) {
return scripts.get(ub).flag;
}
// Registers the nine supported Indic scripts with their table flag and
// Unicode block base (e.g. Devanagari occupies U+0900..U+097F).
static {
scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
scripts.put(BENGALI, new ScriptData(2, 0x0980));
scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
scripts.put(ORIYA, new ScriptData(16, 0x0B00));
scripts.put(TAMIL, new ScriptData(32, 0x0B80));
scripts.put(TELUGU, new ScriptData(64, 0x0C00));
scripts.put(KANNADA, new ScriptData(128, 0x0C80));
scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
}
/**
 * Decompositions according to Unicode 5.2,
 * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
 *
 * Most of these are not handled by unicode normalization anyway.
 *
 * The numbers here represent offsets into the respective codepages,
 * with -1 representing null and 0xFF representing zero-width joiner.
 *
 * the columns are: ch1, ch2, ch3, res, flags
 * ch1, ch2, and ch3 are the decomposition
 * res is the composition, and flags are the scripts to which it applies.
 */
private static final int decompositions[][] = {
/* devanagari, gujarati vowel candra O */
{ 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari short O */
{ 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
/* devanagari, gujarati letter O */
{ 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari letter AI, gujarati letter AU */
{ 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari, bengali, gurmukhi, gujarati, oriya AA */
{ 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
/* devanagari letter candra A */
{ 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
/* gujarati vowel candra E */
{ 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
/* devanagari letter short A */
{ 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
/* gujarati letter E */
{ 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
/* gurmukhi, gujarati letter AI */
{ 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
/* devanagari, gujarati vowel candra O */
{ 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari short O */
{ 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
/* devanagari, gujarati letter O */
{ 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
{ 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
/* devanagari, gujarati vowel candra O */
{ 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari short O */
{ 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
/* devanagari, gujarati letter O */
{ 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari letter AI, gujarati letter AU */
{ 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
/* malayalam letter II */
{ 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
/* devanagari letter UU */
{ 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
/* tamil, malayalam letter UU (some styles) */
{ 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
/* malayalam letter AI */
{ 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
/* devanagari candra E */
{ 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
/* devanagari short E */
{ 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
/* devanagari AI */
{ 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
/* oriya AI */
{ 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
/* malayalam letter OO */
{ 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
/* telugu, kannada letter AU */
{ 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
/* telugu letter OO */
{ 0x12, 0x55, -1, 0x13, flag(TELUGU) },
/* tamil, malayalam letter AU */
{ 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
/* oriya letter AU */
{ 0x13, 0x57, -1, 0x14, flag(ORIYA) },
/* devanagari qa */
{ 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
/* devanagari, gurmukhi khha */
{ 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
/* devanagari, gurmukhi ghha */
{ 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
/* devanagari, gurmukhi za */
{ 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
/* devanagari dddha, bengali, oriya rra */
{ 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
/* devanagari, bengali, oriya rha */
{ 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
/* malayalam chillu nn */
{ 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
/* bengali khanda ta */
{ 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
/* devanagari nnna */
{ 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
/* malayalam chillu n */
{ 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
/* devanagari, gurmukhi fa */
{ 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
/* devanagari, bengali yya */
{ 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
/* telugu letter vocalic R */
{ 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
/* devanagari rra */
{ 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
/* malayalam chillu rr */
{ 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
/* malayalam chillu l */
{ 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
/* devanagari llla */
{ 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
/* malayalam chillu ll */
{ 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
/* telugu letter MA */
{ 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
/* devanagari, gujarati vowel sign candra O */
{ 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari vowel sign short O */
{ 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
/* devanagari, gujarati vowel sign O */
{ 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
/* devanagari, gujarati vowel sign AU */
{ 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
/* kannada vowel sign II */
{ 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
/* gurmukhi vowel sign UU (when stacking) */
{ 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
/* tamil, malayalam vowel sign O */
{ 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
/* kannada vowel sign OO */
{ 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
/* kannada vowel sign O */
{ 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
/* malayalam vowel sign AI (if reordered twice) */
{ 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
/* telugu, kannada vowel sign EE */
{ 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
/* telugu, kannada vowel sign AI */
{ 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
/* tamil, malayalam vowel sign AU */
{ 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
/* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
{ 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
/* bengali, oriya vowel sign AU */
{ 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
/* kannada vowel sign OO */
{ 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
/* gurmukhi letter I */
{ 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
/* gurmukhi letter II */
{ 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
/* gurmukhi letter EE */
{ 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
/* gurmukhi letter U */
{ 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
/* gurmukhi letter UU */
{ 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
/* gurmukhi letter OO */
{ 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
};
// Precomputes, per script, a bitmask of the table-relative first characters
// (ch1 column) that can start a decomposition, so normalize() can cheaply
// skip characters that cannot possibly begin a match.
static {
for (ScriptData sd : scripts.values()) {
sd.decompMask = new BitSet(0x7F);
for (int i = 0; i < decompositions.length; i++) {
final int ch = decompositions[i][0];
final int flags = decompositions[i][4];
if ((flags & sd.flag) != 0)
sd.decompMask.set(ch);
}
}
}
/**
 * Normalizes input text, and returns the new length.
 * The length will always be less than or equal to the existing length.
 *
 * @param text input text
 * @param len valid length
 * @return normalized length
 */
public int normalize(char text[], int len) {
for (int i = 0; i < len; i++) {
// only characters in a registered Indic block are candidates
final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
final ScriptData sd = scripts.get(block);
if (sd != null) {
final int ch = text[i] - sd.base;
if (sd.decompMask.get(ch))
len = compose(ch, block, sd, text, i, len);
}
}
return len;
}
/**
 * Compose into standard form any compositions in the decompositions table.
 */
private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
char text[], int pos, int len) {
if (pos + 1 >= len) /* need at least 2 chars! */
return len;
final int ch1 = text[pos + 1] - sd.base;
final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
if (block1 != block0) /* needs to be the same writing system */
return len;
int ch2 = -1;
// optional third character: a ZWJ maps to the 0xFF sentinel used in the
// table; a character from a different block disqualifies only the 3-char
// match, not the 2-char one
if (pos + 2 < len) {
ch2 = text[pos + 2] - sd.base;
Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
if (text[pos + 2] == '\u200D') // ZWJ
ch2 = 0xFF;
else if (block2 != block1) // still allow a 2-char match
ch2 = -1;
}
// linear scan of the table; first matching row wins. A row matches on
// (ch1, ch2[, ch3]) with the script flag set; ch3 < 0 means 2-char rule.
for (int i = 0; i < decompositions.length; i++)
if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
// write the composed character, then delete the 1 or 2 consumed chars
text[pos] = (char) (sd.base + decompositions[i][3]);
len = delete(text, pos + 1, len);
if (decompositions[i][2] >= 0)
len = delete(text, pos + 1, len);
return len;
}
}
return len;
}
/**
 * Delete a character in-place
 */
private int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
}

View File

@ -0,0 +1,50 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Simple Tokenizer for text in Indian Languages.
*/
/**
 * Simple Tokenizer for text in Indian Languages.
 * <p>
 * A token is a run of letters together with any non-spacing marks,
 * spacing-combining marks, and format characters, so dependent vowel
 * signs and joiners such as ZWJ/ZWNJ remain attached to the word.
 */
public final class IndicTokenizer extends CharTokenizer {

  /** Constructs a tokenizer using the given attribute factory. */
  public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(matchVersion, factory, input);
  }

  /** Constructs a tokenizer sharing attributes with the given source. */
  public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
    super(matchVersion, source, input);
  }

  /** Constructs a tokenizer reading from {@code input}. */
  public IndicTokenizer(Version matchVersion, Reader input) {
    super(matchVersion, input);
  }

  @Override
  protected boolean isTokenChar(int c) {
    if (Character.isLetter(c))
      return true;
    // Keep combining marks and format characters (e.g. ZWJ/ZWNJ) in tokens.
    switch (Character.getType(c)) {
      case Character.NON_SPACING_MARK:
      case Character.FORMAT:
      case Character.COMBINING_SPACING_MARK:
        return true;
      default:
        return false;
    }
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analysis components for Indian languages.
</body>
</html>

View File

@ -0,0 +1,231 @@
# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
अंदर
अत
अपना
अपनी
अपने
अभी
आदि
आप
इत्यादि
इन
इनका
इन्हीं
इन्हें
इन्हों
इस
इसका
इसकी
इसके
इसमें
इसी
इसे
उन
उनका
उनकी
उनके
उनको
उन्हीं
उन्हें
उन्हों
उस
उसके
उसी
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करता
करते
करना
करने
करें
कहते
कहा
का
काफ़ी
कि
कितना
किन्हें
किन्हों
किया
किर
किस
किसी
किसे
की
कुछ
कुल
के
को
कोई
कौन
कौनसा
गया
घर
जब
जहाँ
जा
जितना
जिन
जिन्हें
जिन्हों
जिस
जिसे
जीधर
जैसा
जैसे
जो
तक
तब
तरह
तिन
तिन्हें
तिन्हों
तिस
तिसे
तो
था
थी
थे
दबारा
दिया
दुसरा
दूसरे
दो
द्वारा
नहीं
ना
निहायत
नीचे
ने
पर
पर
पहले
पूरा
पे
फिर
बनी
बही
बहुत
बाद
बाला
बिलकुल
भी
भीतर
मगर
मानो
मे
में
यदि
यह
यहाँ
यही
या
यिह
ये
रखें
रहा
रहे
ऱ्वासा
लिए
लिये
लेकिन
वर्ग
वह
वह
वहाँ
वहीं
वाले
वुह
वे
वग़ैरह
संग
सकता
सकते
सबसे
सभी
साथ
साबुत
साभ
सारा
से
सो
ही
हुआ
हुई
हुए
है
हैं
हो
होता
होती
होते
होना
होने
# additional normalized forms of the above
अपनि
जेसे
होति
सभि
तिंहों
इंहों
दवारा
इसि
किंहें
थि
उंहों
ओर
जिंहें
वहिं
अभि
बनि
हि
उंहिं
उंहें
हें
वगेरह
एसे
रवासा
कोन
निचे
काफि
उसि
पुरा
भितर
हे
बहि
वहां
कोइ
यहां
जिंहों
तिंहें
किसि
कइ
यहि
इंहिं
जिधर
इंहें
अदि
इतयादि
हुइ
कोनसा
इसकि
दुसरे
जहां
अप
किंहों
उनकि
भि
वरग
हुअ
जेसा
नहिं

View File

@ -0,0 +1,51 @@
package org.apache.lucene.analysis.hi;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Tests the HindiAnalyzer
*/
/**
 * Tests the HindiAnalyzer
 */
public class TestHindiAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Constructing the analyzer throws NPE if the bundled stopword
   * file is missing from the classpath.
   */
  public void testResourcesAvailable() {
    new HindiAnalyzer(Version.LUCENE_CURRENT);
  }

  /** Both spellings of 'hindi' itself should reduce to the same term. */
  public void testBasics() throws Exception {
    final Analyzer analyzer = new HindiAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(analyzer, "हिन्दी", "हिंद");
    checkOneTermReuse(analyzer, "हिंदी", "हिंद");
  }

  /** A term placed in the exclusion set must pass through unchanged. */
  public void testExclusionSet() throws Exception {
    final Set<String> exclusions = new HashSet<String>();
    exclusions.add("हिंदी");
    final Analyzer analyzer = new HindiAnalyzer(Version.LUCENE_CURRENT,
        HindiAnalyzer.getDefaultStopSet(), exclusions);
    checkOneTermReuse(analyzer, "हिंदी", "हिंदी");
  }
}

View File

@ -0,0 +1,68 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
* Test HindiNormalizer
*/
/**
 * Test HindiNormalizer
 */
public class TestHindiNormalizer extends BaseTokenStreamTestCase {

  /**
   * All spelling variants of 'English' (example from the paper) should
   * normalize to one and the same form.
   */
  public void testBasics() throws IOException {
    final String[] variants = {
        "अँगरेज़ी", "अँगरेजी", "अँग्रेज़ी", "अँग्रेजी",
        "अंगरेज़ी", "अंगरेजी", "अंग्रेज़ी", "अंग्रेजी"
    };
    for (String variant : variants)
      check(variant, "अंगरेजि");
  }

  /** Checks nukta removal, format-char removal, chandra, and vowel shortening. */
  public void testDecompositions() throws IOException {
    // the nukta dot is removed
    check("क़िताब", "किताब");
    check("फ़र्ज़", "फरज");
    check("क़र्ज़", "करज");
    // some other composed nukta forms
    check("ऱऴख़ग़ड़ढ़य़", "रळखगडढय");
    // format characters (ZWJ/ZWNJ) are dropped
    check("शार्‍मा", "शारमा");
    check("शार्‌मा", "शारमा");
    // chandra forms lose the chandra
    check("ॅॆॉॊऍऎऑऒ\u0972", "ेेोोएएओओअ");
    // long vowels are shortened
    check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
  }

  /** Runs {@code input} through whitespace tokenization + normalization. */
  private void check(String input, String expected) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader(input));
    final TokenFilter filter = new HindiNormalizationFilter(source);
    assertTokenStreamContents(filter, new String[] { expected });
  }
}

View File

@ -0,0 +1,90 @@
package org.apache.lucene.analysis.hi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
* Test HindiStemmer
*/
/**
 * Test HindiStemmer
 */
public class TestHindiStemmer extends BaseTokenStreamTestCase {

  /**
   * Test masc noun inflections
   */
  public void testMasculineNouns() throws IOException {
    checkForms("लडक", "लडका", "लडके", "लडकों");
    checkForms("गुर", "गुरु", "गुरुओं");
    checkForms("दोस्त", "दोस्त", "दोस्तों");
  }

  /**
   * Test feminine noun inflections
   */
  public void testFeminineNouns() throws IOException {
    checkForms("लडक", "लडकी", "लडकियों");
    checkForms("किताब", "किताब", "किताबें", "किताबों");
    checkForms("आध्यापीक", "आध्यापीका", "आध्यापीकाएं", "आध्यापीकाओं");
  }

  /**
   * Test some verb forms
   */
  public void testVerbs() throws IOException {
    checkForms("खा", "खाना", "खाता", "खाती", "खा");
  }

  /**
   * From the paper: since the suffix list for verbs includes AI, awA and anI,
   * additional suffixes had to be added to the list for noun/adjectives
   * ending with these endings.
   */
  public void testExceptions() throws IOException {
    checkForms("कठिन", "कठिनाइयां", "कठिन");
  }

  /** Asserts that every inflected form stems to {@code stem}. */
  private void checkForms(String stem, String... forms) throws IOException {
    for (String form : forms)
      check(form, stem);
  }

  /** Runs {@code input} through whitespace tokenization + stemming. */
  private void check(String input, String expected) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader(input));
    final TokenFilter filter = new HindiStemFilter(source);
    assertTokenStreamContents(filter, new String[] { expected });
  }
}

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
* Test IndicNormalizer
*/
/**
 * Test IndicNormalizer
 */
public class TestIndicNormalizer extends BaseTokenStreamTestCase {

  /**
   * Decomposed independent-vowel sequences should compose to a
   * single code point.
   */
  public void testBasics() throws IOException {
    check("अाॅअाॅ", "ऑऑ");
    check("अाॆअाॆ", "ऒऒ");
    check("अाेअाे", "ओओ");
    check("अाैअाै", "औऔ");
    check("अाअा", "आआ");
    check("अाैर", "और");
    // khanda-ta
    check("ত্‍", "");
  }

  /** Runs {@code input} through whitespace tokenization + normalization. */
  private void check(String input, String expected) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader(input));
    final TokenFilter filter = new IndicNormalizationFilter(source);
    assertTokenStreamContents(filter, new String[] { expected });
  }
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
* Test IndicTokenizer
*/
/**
 * Test IndicTokenizer
 */
public class TestIndicTokenizer extends BaseTokenStreamTestCase {

  /** Test tokenizing Indic vowels, signs, and punctuation */
  public void testBasics() throws IOException {
    final TokenStream stream = new IndicTokenizer(Version.LUCENE_CURRENT,
        new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
    // the danda (।) is punctuation and must not produce a token
    assertTokenStreamContents(stream,
        new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
  }

  /** Test that words with format chars such as ZWJ are kept */
  public void testFormat() throws Exception {
    final TokenStream stream = new IndicTokenizer(Version.LUCENE_CURRENT,
        new StringReader("शार्‍मा शार्‍मा"));
    assertTokenStreamContents(stream, new String[] { "शार्‍मा", "शार्‍मा" });
  }
}