mirror of https://github.com/apache/lucene.git
LUCENE-2234: Hindi Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@906468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f8951f06c
commit
23d403b6bb
|
@ -28,6 +28,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
|||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The Hindi analyzer (contrib/analyzers) comes with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||
|
||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||
|
|
|
@ -103,6 +103,8 @@ New features
|
|||
character is now configurable. It's also up to 20% faster.
|
||||
(Steven Rowe via Robert Muir)
|
||||
|
||||
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.in.IndicTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Analyzer for Hindi.
|
||||
*/
|
||||
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Hindi stopwords.
|
||||
*
|
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||
* The stopword list is BSD-Licensed.
|
||||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
private static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param version lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
super(version, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
||||
CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param version lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords) {
|
||||
this(version, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public HindiAnalyzer(Version version) {
|
||||
this(version, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
|
||||
* filtered with {@link LowerCaseFilter},
|
||||
* {@link IndicNormalizationFilter},
|
||||
* {@link HindiNormalizationFilter},
|
||||
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||
* {@link HindiStemFilter}, and Hindi Stop words
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new IndicTokenizer(matchVersion, reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
if (!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new HindiNormalizationFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
result = new HindiStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
|
||||
* orthography.
|
||||
* <p>
|
||||
* In some cases the normalization may cause unrelated terms to conflate, so
|
||||
* to prevent terms from being normalized use an instance of
|
||||
* {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
* @see HindiNormalizer
|
||||
*/
|
||||
public final class HindiNormalizationFilter extends TokenFilter {
|
||||
|
||||
private final HindiNormalizer normalizer = new HindiNormalizer();
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public HindiNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword())
|
||||
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(),
|
||||
termAtt.termLength()));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,194 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Normalizer for Hindi.
|
||||
* <p>
|
||||
* Normalizes text to remove some differences in spelling variations.
|
||||
* <p>
|
||||
* Implements the Hindi-language specific algorithm specified in:
|
||||
* <i>Word normalization in Indian languages</i>
|
||||
* Prasad Pingali and Vasudeva Varma.
|
||||
* http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
|
||||
* <p>
|
||||
* with the following additions from <i>Hindi CLIR in Thirty Days</i>
|
||||
* Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
|
||||
* http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
|
||||
* <ul>
|
||||
* <li>Internal Zero-width joiner and Zero-width non-joiners are removed
|
||||
* <li>In addition to chandrabindu, NA+halant is normalized to anusvara
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class HindiNormalizer {
|
||||
/**
|
||||
* Normalize an input buffer of Hindi text
|
||||
*
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after normalization
|
||||
*/
|
||||
public int normalize(char s[], int len) {
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if (i + 1 < len && s[i + 1] == '\u094D') {
|
||||
s[i] = '\u0902';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
s[i] = '\u0902';
|
||||
break;
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
case '\u0929':
|
||||
s[i] = '\u0928';
|
||||
break;
|
||||
case '\u0931':
|
||||
s[i] = '\u0930';
|
||||
break;
|
||||
case '\u0934':
|
||||
s[i] = '\u0933';
|
||||
break;
|
||||
case '\u0958':
|
||||
s[i] = '\u0915';
|
||||
break;
|
||||
case '\u0959':
|
||||
s[i] = '\u0916';
|
||||
break;
|
||||
case '\u095A':
|
||||
s[i] = '\u0917';
|
||||
break;
|
||||
case '\u095B':
|
||||
s[i] = '\u091C';
|
||||
break;
|
||||
case '\u095C':
|
||||
s[i] = '\u0921';
|
||||
break;
|
||||
case '\u095D':
|
||||
s[i] = '\u0922';
|
||||
break;
|
||||
case '\u095E':
|
||||
s[i] = '\u092B';
|
||||
break;
|
||||
case '\u095F':
|
||||
s[i] = '\u092F';
|
||||
break;
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// chandra/short -> replace
|
||||
case '\u0945':
|
||||
case '\u0946':
|
||||
s[i] = '\u0947';
|
||||
break;
|
||||
case '\u0949':
|
||||
case '\u094A':
|
||||
s[i] = '\u094B';
|
||||
break;
|
||||
case '\u090D':
|
||||
case '\u090E':
|
||||
s[i] = '\u090F';
|
||||
break;
|
||||
case '\u0911':
|
||||
case '\u0912':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
case '\u0972':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
case '\u0908':
|
||||
s[i] = '\u0907';
|
||||
break;
|
||||
case '\u090A':
|
||||
s[i] = '\u0909';
|
||||
break;
|
||||
case '\u0960':
|
||||
s[i] = '\u090B';
|
||||
break;
|
||||
case '\u0961':
|
||||
s[i] = '\u090C';
|
||||
break;
|
||||
case '\u0910':
|
||||
s[i] = '\u090F';
|
||||
break;
|
||||
case '\u0914':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
s[i] = '\u093F';
|
||||
break;
|
||||
case '\u0942':
|
||||
s[i] = '\u0941';
|
||||
break;
|
||||
case '\u0944':
|
||||
s[i] = '\u0943';
|
||||
break;
|
||||
case '\u0963':
|
||||
s[i] = '\u0962';
|
||||
break;
|
||||
case '\u0948':
|
||||
s[i] = '\u0947';
|
||||
break;
|
||||
case '\u094C':
|
||||
s[i] = '\u094B';
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*
|
||||
* @param s Input Buffer
|
||||
* @param pos Position of character to delete
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after deletion
|
||||
*/
|
||||
protected int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
|
||||
*/
|
||||
public final class HindiStemFilter extends TokenFilter {
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final HindiStemmer stemmer = new HindiStemmer();
|
||||
|
||||
protected HindiStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword())
|
||||
termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Light Stemmer for Hindi.
|
||||
* <p>
|
||||
* Implements the algorithm specified in:
|
||||
* <i>A Lightweight Stemmer for Hindi</i>
|
||||
* Ananthakrishnan Ramanathan and Durgesh D Rao.
|
||||
* http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
|
||||
* </p>
|
||||
*/
|
||||
public class HindiStemmer {
|
||||
public int stem(char buffer[], int len) {
|
||||
// 5
|
||||
if ((len > 6) && (endsWith(buffer, len, "ाएंगी")
|
||||
|| endsWith(buffer, len, "ाएंगे")
|
||||
|| endsWith(buffer, len, "ाऊंगी")
|
||||
|| endsWith(buffer, len, "ाऊंगा")
|
||||
|| endsWith(buffer, len, "ाइयाँ")
|
||||
|| endsWith(buffer, len, "ाइयों")
|
||||
|| endsWith(buffer, len, "ाइयां")
|
||||
))
|
||||
return len - 5;
|
||||
|
||||
// 4
|
||||
if ((len > 5) && (endsWith(buffer, len, "ाएगी")
|
||||
|| endsWith(buffer, len, "ाएगा")
|
||||
|| endsWith(buffer, len, "ाओगी")
|
||||
|| endsWith(buffer, len, "ाओगे")
|
||||
|| endsWith(buffer, len, "एंगी")
|
||||
|| endsWith(buffer, len, "ेंगी")
|
||||
|| endsWith(buffer, len, "एंगे")
|
||||
|| endsWith(buffer, len, "ेंगे")
|
||||
|| endsWith(buffer, len, "ूंगी")
|
||||
|| endsWith(buffer, len, "ूंगा")
|
||||
|| endsWith(buffer, len, "ातीं")
|
||||
|| endsWith(buffer, len, "नाओं")
|
||||
|| endsWith(buffer, len, "नाएं")
|
||||
|| endsWith(buffer, len, "ताओं")
|
||||
|| endsWith(buffer, len, "ताएं")
|
||||
|| endsWith(buffer, len, "ियाँ")
|
||||
|| endsWith(buffer, len, "ियों")
|
||||
|| endsWith(buffer, len, "ियां")
|
||||
))
|
||||
return len - 4;
|
||||
|
||||
// 3
|
||||
if ((len > 4) && (endsWith(buffer, len, "ाकर")
|
||||
|| endsWith(buffer, len, "ाइए")
|
||||
|| endsWith(buffer, len, "ाईं")
|
||||
|| endsWith(buffer, len, "ाया")
|
||||
|| endsWith(buffer, len, "ेगी")
|
||||
|| endsWith(buffer, len, "ेगा")
|
||||
|| endsWith(buffer, len, "ोगी")
|
||||
|| endsWith(buffer, len, "ोगे")
|
||||
|| endsWith(buffer, len, "ाने")
|
||||
|| endsWith(buffer, len, "ाना")
|
||||
|| endsWith(buffer, len, "ाते")
|
||||
|| endsWith(buffer, len, "ाती")
|
||||
|| endsWith(buffer, len, "ाता")
|
||||
|| endsWith(buffer, len, "तीं")
|
||||
|| endsWith(buffer, len, "ाओं")
|
||||
|| endsWith(buffer, len, "ाएं")
|
||||
|| endsWith(buffer, len, "ुओं")
|
||||
|| endsWith(buffer, len, "ुएं")
|
||||
|| endsWith(buffer, len, "ुआं")
|
||||
))
|
||||
return len - 3;
|
||||
|
||||
// 2
|
||||
if ((len > 3) && (endsWith(buffer, len, "कर")
|
||||
|| endsWith(buffer, len, "ाओ")
|
||||
|| endsWith(buffer, len, "िए")
|
||||
|| endsWith(buffer, len, "ाई")
|
||||
|| endsWith(buffer, len, "ाए")
|
||||
|| endsWith(buffer, len, "ने")
|
||||
|| endsWith(buffer, len, "नी")
|
||||
|| endsWith(buffer, len, "ना")
|
||||
|| endsWith(buffer, len, "ते")
|
||||
|| endsWith(buffer, len, "ीं")
|
||||
|| endsWith(buffer, len, "ती")
|
||||
|| endsWith(buffer, len, "ता")
|
||||
|| endsWith(buffer, len, "ाँ")
|
||||
|| endsWith(buffer, len, "ां")
|
||||
|| endsWith(buffer, len, "ों")
|
||||
|| endsWith(buffer, len, "ें")
|
||||
))
|
||||
return len - 2;
|
||||
|
||||
// 1
|
||||
if ((len > 2) && (endsWith(buffer, len, "ो")
|
||||
|| endsWith(buffer, len, "े")
|
||||
|| endsWith(buffer, len, "ू")
|
||||
|| endsWith(buffer, len, "ु")
|
||||
|| endsWith(buffer, len, "ी")
|
||||
|| endsWith(buffer, len, "ि")
|
||||
|| endsWith(buffer, len, "ा")
|
||||
))
|
||||
return len - 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
private boolean endsWith(final char s[], final int len, final String suffix) {
|
||||
final int suffixLen = suffix.length();
|
||||
if (suffixLen > len)
|
||||
return false;
|
||||
for (int i = suffixLen - 1; i >= 0; i--)
|
||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Hindi.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
|
||||
* in Indian Languages.
|
||||
*/
|
||||
public final class IndicNormalizationFilter extends TokenFilter {
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final IndicNormalizer normalizer = new IndicNormalizer();
|
||||
|
||||
public IndicNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()));
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,303 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import static java.lang.Character.UnicodeBlock.*;
|
||||
|
||||
/**
|
||||
* Normalizes the Unicode representation of text in Indian languages.
|
||||
* <p>
|
||||
* Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
|
||||
* and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
|
||||
* </p>
|
||||
*/
|
||||
public class IndicNormalizer {
|
||||
|
||||
private static class ScriptData {
|
||||
final int flag;
|
||||
final int base;
|
||||
BitSet decompMask;
|
||||
|
||||
ScriptData(int flag, int base) {
|
||||
this.flag = flag;
|
||||
this.base = base;
|
||||
}
|
||||
}
|
||||
|
||||
private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
|
||||
new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
|
||||
|
||||
private static int flag(Character.UnicodeBlock ub) {
|
||||
return scripts.get(ub).flag;
|
||||
}
|
||||
|
||||
static {
|
||||
scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
|
||||
scripts.put(BENGALI, new ScriptData(2, 0x0980));
|
||||
scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
|
||||
scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
|
||||
scripts.put(ORIYA, new ScriptData(16, 0x0B00));
|
||||
scripts.put(TAMIL, new ScriptData(32, 0x0B80));
|
||||
scripts.put(TELUGU, new ScriptData(64, 0x0C00));
|
||||
scripts.put(KANNADA, new ScriptData(128, 0x0C80));
|
||||
scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompositions according to Unicode 5.2,
|
||||
* and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
|
||||
*
|
||||
* Most of these are not handled by unicode normalization anyway.
|
||||
*
|
||||
* The numbers here represent offsets into the respective codepages,
|
||||
* with -1 representing null and 0xFF representing zero-width joiner.
|
||||
*
|
||||
* the columns are: ch1, ch2, ch3, res, flags
|
||||
* ch1, ch2, and ch3 are the decomposition
|
||||
* res is the composition, and flags are the scripts to which it applies.
|
||||
*/
|
||||
private static final int decompositions[][] = {
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gujarati letter AU */
|
||||
{ 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari, bengali, gurmukhi, gujarati, oriya AA */
|
||||
{ 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
|
||||
/* devanagari letter candra A */
|
||||
{ 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
|
||||
/* gujarati vowel candra E */
|
||||
{ 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
|
||||
/* devanagari letter short A */
|
||||
{ 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
|
||||
/* gujarati letter E */
|
||||
{ 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
|
||||
/* gurmukhi, gujarati letter AI */
|
||||
{ 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
|
||||
{ 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gujarati letter AU */
|
||||
{ 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* malayalam letter II */
|
||||
{ 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
|
||||
/* devanagari letter UU */
|
||||
{ 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
|
||||
/* tamil, malayalam letter UU (some styles) */
|
||||
{ 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* malayalam letter AI */
|
||||
{ 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
|
||||
/* devanagari candra E */
|
||||
{ 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
|
||||
/* devanagari short E */
|
||||
{ 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
|
||||
/* devanagari AI */
|
||||
{ 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
|
||||
/* oriya AI */
|
||||
{ 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
|
||||
/* malayalam letter OO */
|
||||
{ 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
|
||||
/* telugu, kannada letter AU */
|
||||
{ 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
|
||||
/* telugu letter OO */
|
||||
{ 0x12, 0x55, -1, 0x13, flag(TELUGU) },
|
||||
/* tamil, malayalam letter AU */
|
||||
{ 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* oriya letter AU */
|
||||
{ 0x13, 0x57, -1, 0x14, flag(ORIYA) },
|
||||
/* devanagari qa */
|
||||
{ 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
|
||||
/* devanagari, gurmukhi khha */
|
||||
{ 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, gurmukhi ghha */
|
||||
{ 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, gurmukhi za */
|
||||
{ 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari dddha, bengali, oriya rra */
|
||||
{ 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
|
||||
/* devanagari, bengali, oriya rha */
|
||||
{ 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
|
||||
/* malayalam chillu nn */
|
||||
{ 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
|
||||
/* bengali khanda ta */
|
||||
{ 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
|
||||
/* devanagari nnna */
|
||||
{ 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
|
||||
/* malayalam chillu n */
|
||||
{ 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
|
||||
/* devanagari, gurmukhi fa */
|
||||
{ 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, bengali yya */
|
||||
{ 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
|
||||
/* telugu letter vocalic R */
|
||||
{ 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
|
||||
/* devanagari rra */
|
||||
{ 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
|
||||
/* malayalam chillu rr */
|
||||
{ 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
|
||||
/* malayalam chillu l */
|
||||
{ 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
|
||||
/* devanagari llla */
|
||||
{ 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
|
||||
/* malayalam chillu ll */
|
||||
{ 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
|
||||
/* telugu letter MA */
|
||||
{ 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
|
||||
/* devanagari, gujarati vowel sign candra O */
|
||||
{ 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari vowel sign short O */
|
||||
{ 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati vowel sign O */
|
||||
{ 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel sign AU */
|
||||
{ 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* kannada vowel sign II */
|
||||
{ 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
|
||||
/* gurmukhi vowel sign UU (when stacking) */
|
||||
{ 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
|
||||
/* tamil, malayalam vowel sign O */
|
||||
{ 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* kannada vowel sign OO */
|
||||
{ 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
|
||||
/* kannada vowel sign O */
|
||||
{ 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
|
||||
/* malayalam vowel sign AI (if reordered twice) */
|
||||
{ 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
|
||||
/* telugu, kannada vowel sign EE */
|
||||
{ 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
|
||||
/* telugu, kannada vowel sign AI */
|
||||
{ 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
|
||||
/* tamil, malayalam vowel sign AU */
|
||||
{ 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
|
||||
{ 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* bengali, oriya vowel sign AU */
|
||||
{ 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
|
||||
/* kannada vowel sign OO */
|
||||
{ 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
|
||||
/* gurmukhi letter I */
|
||||
{ 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
|
||||
/* gurmukhi letter II */
|
||||
{ 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
|
||||
/* gurmukhi letter EE */
|
||||
{ 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
|
||||
/* gurmukhi letter U */
|
||||
{ 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
|
||||
/* gurmukhi letter UU */
|
||||
{ 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
|
||||
/* gurmukhi letter OO */
|
||||
{ 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
|
||||
};
|
||||
|
||||
static {
|
||||
for (ScriptData sd : scripts.values()) {
|
||||
sd.decompMask = new BitSet(0x7F);
|
||||
for (int i = 0; i < decompositions.length; i++) {
|
||||
final int ch = decompositions[i][0];
|
||||
final int flags = decompositions[i][4];
|
||||
if ((flags & sd.flag) != 0)
|
||||
sd.decompMask.set(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes input text, and returns the new length.
|
||||
* The length will always be less than or equal to the existing length.
|
||||
*
|
||||
* @param text input text
|
||||
* @param len valid length
|
||||
* @return normalized length
|
||||
*/
|
||||
public int normalize(char text[], int len) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
|
||||
final ScriptData sd = scripts.get(block);
|
||||
if (sd != null) {
|
||||
final int ch = text[i] - sd.base;
|
||||
if (sd.decompMask.get(ch))
|
||||
len = compose(ch, block, sd, text, i, len);
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose into standard form any compositions in the decompositions table.
|
||||
*/
|
||||
private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
|
||||
char text[], int pos, int len) {
|
||||
if (pos + 1 >= len) /* need at least 2 chars! */
|
||||
return len;
|
||||
|
||||
final int ch1 = text[pos + 1] - sd.base;
|
||||
final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
|
||||
if (block1 != block0) /* needs to be the same writing system */
|
||||
return len;
|
||||
|
||||
int ch2 = -1;
|
||||
|
||||
if (pos + 2 < len) {
|
||||
ch2 = text[pos + 2] - sd.base;
|
||||
Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
|
||||
if (text[pos + 2] == '\u200D') // ZWJ
|
||||
ch2 = 0xFF;
|
||||
else if (block2 != block1) // still allow a 2-char match
|
||||
ch2 = -1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < decompositions.length; i++)
|
||||
if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
|
||||
if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
|
||||
text[pos] = (char) (sd.base + decompositions[i][3]);
|
||||
len = delete(text, pos + 1, len);
|
||||
if (decompositions[i][2] >= 0)
|
||||
len = delete(text, pos + 1, len);
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*/
|
||||
private int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.CharTokenizer;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Simple Tokenizer for text in Indian Languages.
|
||||
*/
|
||||
public final class IndicTokenizer extends CharTokenizer {
|
||||
|
||||
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
|
||||
super(matchVersion, factory, input);
|
||||
}
|
||||
|
||||
public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
|
||||
super(matchVersion, source, input);
|
||||
}
|
||||
|
||||
public IndicTokenizer(Version matchVersion, Reader input) {
|
||||
super(matchVersion, input);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isTokenChar(int c) {
|
||||
return Character.isLetter(c)
|
||||
|| Character.getType(c) == Character.NON_SPACING_MARK
|
||||
|| Character.getType(c) == Character.FORMAT
|
||||
|| Character.getType(c) == Character.COMBINING_SPACING_MARK;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analysis components for Indian languages.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,231 @@
|
|||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
अंदर
|
||||
अत
|
||||
अपना
|
||||
अपनी
|
||||
अपने
|
||||
अभी
|
||||
आदि
|
||||
आप
|
||||
इत्यादि
|
||||
इन
|
||||
इनका
|
||||
इन्हीं
|
||||
इन्हें
|
||||
इन्हों
|
||||
इस
|
||||
इसका
|
||||
इसकी
|
||||
इसके
|
||||
इसमें
|
||||
इसी
|
||||
इसे
|
||||
उन
|
||||
उनका
|
||||
उनकी
|
||||
उनके
|
||||
उनको
|
||||
उन्हीं
|
||||
उन्हें
|
||||
उन्हों
|
||||
उस
|
||||
उसके
|
||||
उसी
|
||||
उसे
|
||||
एक
|
||||
एवं
|
||||
एस
|
||||
ऐसे
|
||||
और
|
||||
कई
|
||||
कर
|
||||
करता
|
||||
करते
|
||||
करना
|
||||
करने
|
||||
करें
|
||||
कहते
|
||||
कहा
|
||||
का
|
||||
काफ़ी
|
||||
कि
|
||||
कितना
|
||||
किन्हें
|
||||
किन्हों
|
||||
किया
|
||||
किर
|
||||
किस
|
||||
किसी
|
||||
किसे
|
||||
की
|
||||
कुछ
|
||||
कुल
|
||||
के
|
||||
को
|
||||
कोई
|
||||
कौन
|
||||
कौनसा
|
||||
गया
|
||||
घर
|
||||
जब
|
||||
जहाँ
|
||||
जा
|
||||
जितना
|
||||
जिन
|
||||
जिन्हें
|
||||
जिन्हों
|
||||
जिस
|
||||
जिसे
|
||||
जीधर
|
||||
जैसा
|
||||
जैसे
|
||||
जो
|
||||
तक
|
||||
तब
|
||||
तरह
|
||||
तिन
|
||||
तिन्हें
|
||||
तिन्हों
|
||||
तिस
|
||||
तिसे
|
||||
तो
|
||||
था
|
||||
थी
|
||||
थे
|
||||
दबारा
|
||||
दिया
|
||||
दुसरा
|
||||
दूसरे
|
||||
दो
|
||||
द्वारा
|
||||
न
|
||||
नहीं
|
||||
ना
|
||||
निहायत
|
||||
नीचे
|
||||
ने
|
||||
पर
|
||||
पर
|
||||
पहले
|
||||
पूरा
|
||||
पे
|
||||
फिर
|
||||
बनी
|
||||
बही
|
||||
बहुत
|
||||
बाद
|
||||
बाला
|
||||
बिलकुल
|
||||
भी
|
||||
भीतर
|
||||
मगर
|
||||
मानो
|
||||
मे
|
||||
में
|
||||
यदि
|
||||
यह
|
||||
यहाँ
|
||||
यही
|
||||
या
|
||||
यिह
|
||||
ये
|
||||
रखें
|
||||
रहा
|
||||
रहे
|
||||
ऱ्वासा
|
||||
लिए
|
||||
लिये
|
||||
लेकिन
|
||||
व
|
||||
वर्ग
|
||||
वह
|
||||
वह
|
||||
वहाँ
|
||||
वहीं
|
||||
वाले
|
||||
वुह
|
||||
वे
|
||||
वग़ैरह
|
||||
संग
|
||||
सकता
|
||||
सकते
|
||||
सबसे
|
||||
सभी
|
||||
साथ
|
||||
साबुत
|
||||
साभ
|
||||
सारा
|
||||
से
|
||||
सो
|
||||
ही
|
||||
हुआ
|
||||
हुई
|
||||
हुए
|
||||
है
|
||||
हैं
|
||||
हो
|
||||
होता
|
||||
होती
|
||||
होते
|
||||
होना
|
||||
होने
|
||||
# additional normalized forms of the above
|
||||
अपनि
|
||||
जेसे
|
||||
होति
|
||||
सभि
|
||||
तिंहों
|
||||
इंहों
|
||||
दवारा
|
||||
इसि
|
||||
किंहें
|
||||
थि
|
||||
उंहों
|
||||
ओर
|
||||
जिंहें
|
||||
वहिं
|
||||
अभि
|
||||
बनि
|
||||
हि
|
||||
उंहिं
|
||||
उंहें
|
||||
हें
|
||||
वगेरह
|
||||
एसे
|
||||
रवासा
|
||||
कोन
|
||||
निचे
|
||||
काफि
|
||||
उसि
|
||||
पुरा
|
||||
भितर
|
||||
हे
|
||||
बहि
|
||||
वहां
|
||||
कोइ
|
||||
यहां
|
||||
जिंहों
|
||||
तिंहें
|
||||
किसि
|
||||
कइ
|
||||
यहि
|
||||
इंहिं
|
||||
जिधर
|
||||
इंहें
|
||||
अदि
|
||||
इतयादि
|
||||
हुइ
|
||||
कोनसा
|
||||
इसकि
|
||||
दुसरे
|
||||
जहां
|
||||
अप
|
||||
किंहों
|
||||
उनकि
|
||||
भि
|
||||
वरग
|
||||
हुअ
|
||||
जेसा
|
||||
नहिं
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tests the HindiAnalyzer
|
||||
*/
|
||||
public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||
}
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||
// two ways to write 'hindi' itself.
|
||||
checkOneTermReuse(a, "हिन्दी", "हिंद");
|
||||
checkOneTermReuse(a, "हिंदी", "हिंद");
|
||||
}
|
||||
|
||||
public void testExclusionSet() throws Exception {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("हिंदी");
|
||||
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT,
|
||||
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test HindiNormalizer
|
||||
*/
|
||||
public class TestHindiNormalizer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test some basic normalization, with an example from the paper.
|
||||
*/
|
||||
public void testBasics() throws IOException {
|
||||
check("अँगरेज़ी", "अंगरेजि");
|
||||
check("अँगरेजी", "अंगरेजि");
|
||||
check("अँग्रेज़ी", "अंगरेजि");
|
||||
check("अँग्रेजी", "अंगरेजि");
|
||||
check("अंगरेज़ी", "अंगरेजि");
|
||||
check("अंगरेजी", "अंगरेजि");
|
||||
check("अंग्रेज़ी", "अंगरेजि");
|
||||
check("अंग्रेजी", "अंगरेजि");
|
||||
}
|
||||
|
||||
public void testDecompositions() throws IOException {
|
||||
// removing nukta dot
|
||||
check("क़िताब", "किताब");
|
||||
check("फ़र्ज़", "फरज");
|
||||
check("क़र्ज़", "करज");
|
||||
// some other composed nukta forms
|
||||
check("ऱऴख़ग़ड़ढ़य़", "रळखगडढय");
|
||||
// removal of format (ZWJ/ZWNJ)
|
||||
check("शार्मा", "शारमा");
|
||||
check("शार्मा", "शारमा");
|
||||
// removal of chandra
|
||||
check("ॅॆॉॊऍऎऑऒ\u0972", "ेेोोएएओओअ");
|
||||
// vowel shortening
|
||||
check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
|
||||
}
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new HindiNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test HindiStemmer
|
||||
*/
|
||||
public class TestHindiStemmer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test masc noun inflections
|
||||
*/
|
||||
public void testMasculineNouns() throws IOException {
|
||||
check("लडका", "लडक");
|
||||
check("लडके", "लडक");
|
||||
check("लडकों", "लडक");
|
||||
|
||||
check("गुरु", "गुर");
|
||||
check("गुरुओं", "गुर");
|
||||
|
||||
check("दोस्त", "दोस्त");
|
||||
check("दोस्तों", "दोस्त");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test feminine noun inflections
|
||||
*/
|
||||
public void testFeminineNouns() throws IOException {
|
||||
check("लडकी", "लडक");
|
||||
check("लडकियों", "लडक");
|
||||
|
||||
check("किताब", "किताब");
|
||||
check("किताबें", "किताब");
|
||||
check("किताबों", "किताब");
|
||||
|
||||
check("आध्यापीका", "आध्यापीक");
|
||||
check("आध्यापीकाएं", "आध्यापीक");
|
||||
check("आध्यापीकाओं", "आध्यापीक");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some verb forms
|
||||
*/
|
||||
public void testVerbs() throws IOException {
|
||||
check("खाना", "खा");
|
||||
check("खाता", "खा");
|
||||
check("खाती", "खा");
|
||||
check("खा", "खा");
|
||||
}
|
||||
|
||||
/**
|
||||
* From the paper: since the suffix list for verbs includes AI, awA and anI,
|
||||
* additional suffixes had to be added to the list for noun/adjectives
|
||||
* ending with these endings.
|
||||
*/
|
||||
public void testExceptions() throws IOException {
|
||||
check("कठिनाइयां", "कठिन");
|
||||
check("कठिन", "कठिन");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new HindiStemFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test IndicNormalizer
|
||||
*/
|
||||
public class TestIndicNormalizer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test some basic normalization
|
||||
*/
|
||||
public void testBasics() throws IOException {
|
||||
check("अाॅअाॅ", "ऑऑ");
|
||||
check("अाॆअाॆ", "ऒऒ");
|
||||
check("अाेअाे", "ओओ");
|
||||
check("अाैअाै", "औऔ");
|
||||
check("अाअा", "आआ");
|
||||
check("अाैर", "और");
|
||||
// khanda-ta
|
||||
check("ত্", "ৎ");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test IndicTokenizer
|
||||
*/
|
||||
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
|
||||
/** Test tokenizing Indic vowels, signs, and punctuation */
|
||||
public void testBasics() throws IOException {
|
||||
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
||||
}
|
||||
|
||||
/** Test that words with format chars such as ZWJ are kept */
|
||||
public void testFormat() throws Exception {
|
||||
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader("शार्मा शार्मा"));
|
||||
assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue