mirror of https://github.com/apache/lucene.git
LUCENE-2234: Hindi Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@906468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f8951f06c
commit
23d403b6bb
|
@ -28,6 +28,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
|||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The Hindi analyzer (contrib/analyzers) comes with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
Includes lib/servlet-api-2.4.jar from Apache Tomcat
|
||||
|
||||
The SmartChineseAnalyzer source code (under contrib/analyzers) was
|
||||
|
|
|
@ -103,6 +103,8 @@ New features
|
|||
character is now configurable. It's also up to 20% faster.
|
||||
(Steven Rowe via Robert Muir)
|
||||
|
||||
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.in.IndicTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Analyzer for Hindi.
|
||||
*/
|
||||
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Hindi stopwords.
|
||||
*
|
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||
* The stopword list is BSD-Licensed.
|
||||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
private static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, HindiAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param version lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
super(version, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
||||
CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param version lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords) {
|
||||
this(version, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public HindiAnalyzer(Version version) {
|
||||
this(version, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
|
||||
* filtered with {@link LowerCaseFilter},
|
||||
* {@link IndicNormalizationFilter},
|
||||
* {@link HindiNormalizationFilter},
|
||||
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||
* {@link HindiStemFilter}, and Hindi Stop words
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new IndicTokenizer(matchVersion, reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
if (!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new HindiNormalizationFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
result = new HindiStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
|
||||
* orthography.
|
||||
* <p>
|
||||
* In some cases the normalization may cause unrelated terms to conflate, so
|
||||
* to prevent terms from being normalized use an instance of
|
||||
* {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
* @see HindiNormalizer
|
||||
*/
|
||||
public final class HindiNormalizationFilter extends TokenFilter {
|
||||
|
||||
private final HindiNormalizer normalizer = new HindiNormalizer();
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public HindiNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword())
|
||||
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(),
|
||||
termAtt.termLength()));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,194 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Normalizer for Hindi.
|
||||
* <p>
|
||||
* Normalizes text to remove some differences in spelling variations.
|
||||
* <p>
|
||||
* Implements the Hindi-language specific algorithm specified in:
|
||||
* <i>Word normalization in Indian languages</i>
|
||||
* Prasad Pingali and Vasudeva Varma.
|
||||
* http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
|
||||
* <p>
|
||||
* with the following additions from <i>Hindi CLIR in Thirty Days</i>
|
||||
* Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
|
||||
* http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
|
||||
* <ul>
|
||||
* <li>Internal Zero-width joiner and Zero-width non-joiners are removed
|
||||
* <li>In addition to chandrabindu, NA+halant is normalized to anusvara
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class HindiNormalizer {
|
||||
/**
|
||||
* Normalize an input buffer of Hindi text
|
||||
*
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after normalization
|
||||
*/
|
||||
public int normalize(char s[], int len) {
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if (i + 1 < len && s[i + 1] == '\u094D') {
|
||||
s[i] = '\u0902';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
s[i] = '\u0902';
|
||||
break;
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
case '\u0929':
|
||||
s[i] = '\u0928';
|
||||
break;
|
||||
case '\u0931':
|
||||
s[i] = '\u0930';
|
||||
break;
|
||||
case '\u0934':
|
||||
s[i] = '\u0933';
|
||||
break;
|
||||
case '\u0958':
|
||||
s[i] = '\u0915';
|
||||
break;
|
||||
case '\u0959':
|
||||
s[i] = '\u0916';
|
||||
break;
|
||||
case '\u095A':
|
||||
s[i] = '\u0917';
|
||||
break;
|
||||
case '\u095B':
|
||||
s[i] = '\u091C';
|
||||
break;
|
||||
case '\u095C':
|
||||
s[i] = '\u0921';
|
||||
break;
|
||||
case '\u095D':
|
||||
s[i] = '\u0922';
|
||||
break;
|
||||
case '\u095E':
|
||||
s[i] = '\u092B';
|
||||
break;
|
||||
case '\u095F':
|
||||
s[i] = '\u092F';
|
||||
break;
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// chandra/short -> replace
|
||||
case '\u0945':
|
||||
case '\u0946':
|
||||
s[i] = '\u0947';
|
||||
break;
|
||||
case '\u0949':
|
||||
case '\u094A':
|
||||
s[i] = '\u094B';
|
||||
break;
|
||||
case '\u090D':
|
||||
case '\u090E':
|
||||
s[i] = '\u090F';
|
||||
break;
|
||||
case '\u0911':
|
||||
case '\u0912':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
case '\u0972':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
case '\u0908':
|
||||
s[i] = '\u0907';
|
||||
break;
|
||||
case '\u090A':
|
||||
s[i] = '\u0909';
|
||||
break;
|
||||
case '\u0960':
|
||||
s[i] = '\u090B';
|
||||
break;
|
||||
case '\u0961':
|
||||
s[i] = '\u090C';
|
||||
break;
|
||||
case '\u0910':
|
||||
s[i] = '\u090F';
|
||||
break;
|
||||
case '\u0914':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
s[i] = '\u093F';
|
||||
break;
|
||||
case '\u0942':
|
||||
s[i] = '\u0941';
|
||||
break;
|
||||
case '\u0944':
|
||||
s[i] = '\u0943';
|
||||
break;
|
||||
case '\u0963':
|
||||
s[i] = '\u0962';
|
||||
break;
|
||||
case '\u0948':
|
||||
s[i] = '\u0947';
|
||||
break;
|
||||
case '\u094C':
|
||||
s[i] = '\u094B';
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*
|
||||
* @param s Input Buffer
|
||||
* @param pos Position of character to delete
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after deletion
|
||||
*/
|
||||
protected int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
|
||||
*/
|
||||
public final class HindiStemFilter extends TokenFilter {
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final HindiStemmer stemmer = new HindiStemmer();
|
||||
|
||||
protected HindiStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword())
|
||||
termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Light Stemmer for Hindi.
|
||||
* <p>
|
||||
* Implements the algorithm specified in:
|
||||
* <i>A Lightweight Stemmer for Hindi</i>
|
||||
* Ananthakrishnan Ramanathan and Durgesh D Rao.
|
||||
* http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
|
||||
* </p>
|
||||
*/
|
||||
public class HindiStemmer {
|
||||
public int stem(char buffer[], int len) {
|
||||
// 5
|
||||
if ((len > 6) && (endsWith(buffer, len, "ाएंगी")
|
||||
|| endsWith(buffer, len, "ाएंगे")
|
||||
|| endsWith(buffer, len, "ाऊंगी")
|
||||
|| endsWith(buffer, len, "ाऊंगा")
|
||||
|| endsWith(buffer, len, "ाइयाँ")
|
||||
|| endsWith(buffer, len, "ाइयों")
|
||||
|| endsWith(buffer, len, "ाइयां")
|
||||
))
|
||||
return len - 5;
|
||||
|
||||
// 4
|
||||
if ((len > 5) && (endsWith(buffer, len, "ाएगी")
|
||||
|| endsWith(buffer, len, "ाएगा")
|
||||
|| endsWith(buffer, len, "ाओगी")
|
||||
|| endsWith(buffer, len, "ाओगे")
|
||||
|| endsWith(buffer, len, "एंगी")
|
||||
|| endsWith(buffer, len, "ेंगी")
|
||||
|| endsWith(buffer, len, "एंगे")
|
||||
|| endsWith(buffer, len, "ेंगे")
|
||||
|| endsWith(buffer, len, "ूंगी")
|
||||
|| endsWith(buffer, len, "ूंगा")
|
||||
|| endsWith(buffer, len, "ातीं")
|
||||
|| endsWith(buffer, len, "नाओं")
|
||||
|| endsWith(buffer, len, "नाएं")
|
||||
|| endsWith(buffer, len, "ताओं")
|
||||
|| endsWith(buffer, len, "ताएं")
|
||||
|| endsWith(buffer, len, "ियाँ")
|
||||
|| endsWith(buffer, len, "ियों")
|
||||
|| endsWith(buffer, len, "ियां")
|
||||
))
|
||||
return len - 4;
|
||||
|
||||
// 3
|
||||
if ((len > 4) && (endsWith(buffer, len, "ाकर")
|
||||
|| endsWith(buffer, len, "ाइए")
|
||||
|| endsWith(buffer, len, "ाईं")
|
||||
|| endsWith(buffer, len, "ाया")
|
||||
|| endsWith(buffer, len, "ेगी")
|
||||
|| endsWith(buffer, len, "ेगा")
|
||||
|| endsWith(buffer, len, "ोगी")
|
||||
|| endsWith(buffer, len, "ोगे")
|
||||
|| endsWith(buffer, len, "ाने")
|
||||
|| endsWith(buffer, len, "ाना")
|
||||
|| endsWith(buffer, len, "ाते")
|
||||
|| endsWith(buffer, len, "ाती")
|
||||
|| endsWith(buffer, len, "ाता")
|
||||
|| endsWith(buffer, len, "तीं")
|
||||
|| endsWith(buffer, len, "ाओं")
|
||||
|| endsWith(buffer, len, "ाएं")
|
||||
|| endsWith(buffer, len, "ुओं")
|
||||
|| endsWith(buffer, len, "ुएं")
|
||||
|| endsWith(buffer, len, "ुआं")
|
||||
))
|
||||
return len - 3;
|
||||
|
||||
// 2
|
||||
if ((len > 3) && (endsWith(buffer, len, "कर")
|
||||
|| endsWith(buffer, len, "ाओ")
|
||||
|| endsWith(buffer, len, "िए")
|
||||
|| endsWith(buffer, len, "ाई")
|
||||
|| endsWith(buffer, len, "ाए")
|
||||
|| endsWith(buffer, len, "ने")
|
||||
|| endsWith(buffer, len, "नी")
|
||||
|| endsWith(buffer, len, "ना")
|
||||
|| endsWith(buffer, len, "ते")
|
||||
|| endsWith(buffer, len, "ीं")
|
||||
|| endsWith(buffer, len, "ती")
|
||||
|| endsWith(buffer, len, "ता")
|
||||
|| endsWith(buffer, len, "ाँ")
|
||||
|| endsWith(buffer, len, "ां")
|
||||
|| endsWith(buffer, len, "ों")
|
||||
|| endsWith(buffer, len, "ें")
|
||||
))
|
||||
return len - 2;
|
||||
|
||||
// 1
|
||||
if ((len > 2) && (endsWith(buffer, len, "ो")
|
||||
|| endsWith(buffer, len, "े")
|
||||
|| endsWith(buffer, len, "ू")
|
||||
|| endsWith(buffer, len, "ु")
|
||||
|| endsWith(buffer, len, "ी")
|
||||
|| endsWith(buffer, len, "ि")
|
||||
|| endsWith(buffer, len, "ा")
|
||||
))
|
||||
return len - 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
private boolean endsWith(final char s[], final int len, final String suffix) {
|
||||
final int suffixLen = suffix.length();
|
||||
if (suffixLen > len)
|
||||
return false;
|
||||
for (int i = suffixLen - 1; i >= 0; i--)
|
||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Hindi.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
|
||||
* in Indian Languages.
|
||||
*/
|
||||
public final class IndicNormalizationFilter extends TokenFilter {
|
||||
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||
private final IndicNormalizer normalizer = new IndicNormalizer();
|
||||
|
||||
public IndicNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()));
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,303 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import static java.lang.Character.UnicodeBlock.*;
|
||||
|
||||
/**
|
||||
* Normalizes the Unicode representation of text in Indian languages.
|
||||
* <p>
|
||||
* Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
|
||||
* and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
|
||||
* </p>
|
||||
*/
|
||||
public class IndicNormalizer {
|
||||
|
||||
private static class ScriptData {
|
||||
final int flag;
|
||||
final int base;
|
||||
BitSet decompMask;
|
||||
|
||||
ScriptData(int flag, int base) {
|
||||
this.flag = flag;
|
||||
this.base = base;
|
||||
}
|
||||
}
|
||||
|
||||
private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
|
||||
new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
|
||||
|
||||
private static int flag(Character.UnicodeBlock ub) {
|
||||
return scripts.get(ub).flag;
|
||||
}
|
||||
|
||||
static {
|
||||
scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
|
||||
scripts.put(BENGALI, new ScriptData(2, 0x0980));
|
||||
scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
|
||||
scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
|
||||
scripts.put(ORIYA, new ScriptData(16, 0x0B00));
|
||||
scripts.put(TAMIL, new ScriptData(32, 0x0B80));
|
||||
scripts.put(TELUGU, new ScriptData(64, 0x0C00));
|
||||
scripts.put(KANNADA, new ScriptData(128, 0x0C80));
|
||||
scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompositions according to Unicode 5.2,
|
||||
* and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
|
||||
*
|
||||
* Most of these are not handled by unicode normalization anyway.
|
||||
*
|
||||
* The numbers here represent offsets into the respective codepages,
|
||||
* with -1 representing null and 0xFF representing zero-width joiner.
|
||||
*
|
||||
* the columns are: ch1, ch2, ch3, res, flags
|
||||
* ch1, ch2, and ch3 are the decomposition
|
||||
* res is the composition, and flags are the scripts to which it applies.
|
||||
*/
|
||||
private static final int decompositions[][] = {
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gujarati letter AU */
|
||||
{ 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari, bengali, gurmukhi, gujarati, oriya AA */
|
||||
{ 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
|
||||
/* devanagari letter candra A */
|
||||
{ 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
|
||||
/* gujarati vowel candra E */
|
||||
{ 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
|
||||
/* devanagari letter short A */
|
||||
{ 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
|
||||
/* gujarati letter E */
|
||||
{ 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
|
||||
/* gurmukhi, gujarati letter AI */
|
||||
{ 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
|
||||
{ 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel candra O */
|
||||
{ 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari short O */
|
||||
{ 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati letter O */
|
||||
{ 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari letter AI, gujarati letter AU */
|
||||
{ 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* malayalam letter II */
|
||||
{ 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
|
||||
/* devanagari letter UU */
|
||||
{ 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
|
||||
/* tamil, malayalam letter UU (some styles) */
|
||||
{ 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* malayalam letter AI */
|
||||
{ 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
|
||||
/* devanagari candra E */
|
||||
{ 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
|
||||
/* devanagari short E */
|
||||
{ 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
|
||||
/* devanagari AI */
|
||||
{ 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
|
||||
/* oriya AI */
|
||||
{ 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
|
||||
/* malayalam letter OO */
|
||||
{ 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
|
||||
/* telugu, kannada letter AU */
|
||||
{ 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
|
||||
/* telugu letter OO */
|
||||
{ 0x12, 0x55, -1, 0x13, flag(TELUGU) },
|
||||
/* tamil, malayalam letter AU */
|
||||
{ 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* oriya letter AU */
|
||||
{ 0x13, 0x57, -1, 0x14, flag(ORIYA) },
|
||||
/* devanagari qa */
|
||||
{ 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
|
||||
/* devanagari, gurmukhi khha */
|
||||
{ 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, gurmukhi ghha */
|
||||
{ 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, gurmukhi za */
|
||||
{ 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari dddha, bengali, oriya rra */
|
||||
{ 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
|
||||
/* devanagari, bengali, oriya rha */
|
||||
{ 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
|
||||
/* malayalam chillu nn */
|
||||
{ 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
|
||||
/* bengali khanda ta */
|
||||
{ 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
|
||||
/* devanagari nnna */
|
||||
{ 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
|
||||
/* malayalam chillu n */
|
||||
{ 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
|
||||
/* devanagari, gurmukhi fa */
|
||||
{ 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
|
||||
/* devanagari, bengali yya */
|
||||
{ 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
|
||||
/* telugu letter vocalic R */
|
||||
{ 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
|
||||
/* devanagari rra */
|
||||
{ 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
|
||||
/* malayalam chillu rr */
|
||||
{ 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
|
||||
/* malayalam chillu l */
|
||||
{ 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
|
||||
/* devanagari llla */
|
||||
{ 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
|
||||
/* malayalam chillu ll */
|
||||
{ 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
|
||||
/* telugu letter MA */
|
||||
{ 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
|
||||
/* devanagari, gujarati vowel sign candra O */
|
||||
{ 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari vowel sign short O */
|
||||
{ 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
|
||||
/* devanagari, gujarati vowel sign O */
|
||||
{ 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* devanagari, gujarati vowel sign AU */
|
||||
{ 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
|
||||
/* kannada vowel sign II */
|
||||
{ 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
|
||||
/* gurmukhi vowel sign UU (when stacking) */
|
||||
{ 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
|
||||
/* tamil, malayalam vowel sign O */
|
||||
{ 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* kannada vowel sign OO */
|
||||
{ 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
|
||||
/* kannada vowel sign O */
|
||||
{ 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
|
||||
/* malayalam vowel sign AI (if reordered twice) */
|
||||
{ 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
|
||||
/* telugu, kannada vowel sign EE */
|
||||
{ 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
|
||||
/* telugu, kannada vowel sign AI */
|
||||
{ 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
|
||||
/* tamil, malayalam vowel sign AU */
|
||||
{ 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
|
||||
{ 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
|
||||
/* bengali, oriya vowel sign AU */
|
||||
{ 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
|
||||
/* kannada vowel sign OO */
|
||||
{ 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
|
||||
/* gurmukhi letter I */
|
||||
{ 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
|
||||
/* gurmukhi letter II */
|
||||
{ 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
|
||||
/* gurmukhi letter EE */
|
||||
{ 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
|
||||
/* gurmukhi letter U */
|
||||
{ 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
|
||||
/* gurmukhi letter UU */
|
||||
{ 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
|
||||
/* gurmukhi letter OO */
|
||||
{ 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
|
||||
};
|
||||
|
||||
static {
|
||||
for (ScriptData sd : scripts.values()) {
|
||||
sd.decompMask = new BitSet(0x7F);
|
||||
for (int i = 0; i < decompositions.length; i++) {
|
||||
final int ch = decompositions[i][0];
|
||||
final int flags = decompositions[i][4];
|
||||
if ((flags & sd.flag) != 0)
|
||||
sd.decompMask.set(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes input text, and returns the new length.
|
||||
* The length will always be less than or equal to the existing length.
|
||||
*
|
||||
* @param text input text
|
||||
* @param len valid length
|
||||
* @return normalized length
|
||||
*/
|
||||
public int normalize(char text[], int len) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
|
||||
final ScriptData sd = scripts.get(block);
|
||||
if (sd != null) {
|
||||
final int ch = text[i] - sd.base;
|
||||
if (sd.decompMask.get(ch))
|
||||
len = compose(ch, block, sd, text, i, len);
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose into standard form any compositions in the decompositions table.
|
||||
*/
|
||||
private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
|
||||
char text[], int pos, int len) {
|
||||
if (pos + 1 >= len) /* need at least 2 chars! */
|
||||
return len;
|
||||
|
||||
final int ch1 = text[pos + 1] - sd.base;
|
||||
final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
|
||||
if (block1 != block0) /* needs to be the same writing system */
|
||||
return len;
|
||||
|
||||
int ch2 = -1;
|
||||
|
||||
if (pos + 2 < len) {
|
||||
ch2 = text[pos + 2] - sd.base;
|
||||
Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
|
||||
if (text[pos + 2] == '\u200D') // ZWJ
|
||||
ch2 = 0xFF;
|
||||
else if (block2 != block1) // still allow a 2-char match
|
||||
ch2 = -1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < decompositions.length; i++)
|
||||
if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
|
||||
if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
|
||||
text[pos] = (char) (sd.base + decompositions[i][3]);
|
||||
len = delete(text, pos + 1, len);
|
||||
if (decompositions[i][2] >= 0)
|
||||
len = delete(text, pos + 1, len);
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*/
|
||||
private int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.CharTokenizer;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Simple Tokenizer for text in Indian Languages.
|
||||
*/
|
||||
public final class IndicTokenizer extends CharTokenizer {
|
||||
|
||||
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
|
||||
super(matchVersion, factory, input);
|
||||
}
|
||||
|
||||
public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
|
||||
super(matchVersion, source, input);
|
||||
}
|
||||
|
||||
public IndicTokenizer(Version matchVersion, Reader input) {
|
||||
super(matchVersion, input);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isTokenChar(int c) {
|
||||
return Character.isLetter(c)
|
||||
|| Character.getType(c) == Character.NON_SPACING_MARK
|
||||
|| Character.getType(c) == Character.FORMAT
|
||||
|| Character.getType(c) == Character.COMBINING_SPACING_MARK;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analysis components for Indian languages.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,231 @@
|
|||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
अंदर
|
||||
अत
|
||||
अपना
|
||||
अपनी
|
||||
अपने
|
||||
अभी
|
||||
आदि
|
||||
आप
|
||||
इत्यादि
|
||||
इन
|
||||
इनका
|
||||
इन्हीं
|
||||
इन्हें
|
||||
इन्हों
|
||||
इस
|
||||
इसका
|
||||
इसकी
|
||||
इसके
|
||||
इसमें
|
||||
इसी
|
||||
इसे
|
||||
उन
|
||||
उनका
|
||||
उनकी
|
||||
उनके
|
||||
उनको
|
||||
उन्हीं
|
||||
उन्हें
|
||||
उन्हों
|
||||
उस
|
||||
उसके
|
||||
उसी
|
||||
उसे
|
||||
एक
|
||||
एवं
|
||||
एस
|
||||
ऐसे
|
||||
और
|
||||
कई
|
||||
कर
|
||||
करता
|
||||
करते
|
||||
करना
|
||||
करने
|
||||
करें
|
||||
कहते
|
||||
कहा
|
||||
का
|
||||
काफ़ी
|
||||
कि
|
||||
कितना
|
||||
किन्हें
|
||||
किन्हों
|
||||
किया
|
||||
किर
|
||||
किस
|
||||
किसी
|
||||
किसे
|
||||
की
|
||||
कुछ
|
||||
कुल
|
||||
के
|
||||
को
|
||||
कोई
|
||||
कौन
|
||||
कौनसा
|
||||
गया
|
||||
घर
|
||||
जब
|
||||
जहाँ
|
||||
जा
|
||||
जितना
|
||||
जिन
|
||||
जिन्हें
|
||||
जिन्हों
|
||||
जिस
|
||||
जिसे
|
||||
जीधर
|
||||
जैसा
|
||||
जैसे
|
||||
जो
|
||||
तक
|
||||
तब
|
||||
तरह
|
||||
तिन
|
||||
तिन्हें
|
||||
तिन्हों
|
||||
तिस
|
||||
तिसे
|
||||
तो
|
||||
था
|
||||
थी
|
||||
थे
|
||||
दबारा
|
||||
दिया
|
||||
दुसरा
|
||||
दूसरे
|
||||
दो
|
||||
द्वारा
|
||||
न
|
||||
नहीं
|
||||
ना
|
||||
निहायत
|
||||
नीचे
|
||||
ने
|
||||
पर
|
||||
पर
|
||||
पहले
|
||||
पूरा
|
||||
पे
|
||||
फिर
|
||||
बनी
|
||||
बही
|
||||
बहुत
|
||||
बाद
|
||||
बाला
|
||||
बिलकुल
|
||||
भी
|
||||
भीतर
|
||||
मगर
|
||||
मानो
|
||||
मे
|
||||
में
|
||||
यदि
|
||||
यह
|
||||
यहाँ
|
||||
यही
|
||||
या
|
||||
यिह
|
||||
ये
|
||||
रखें
|
||||
रहा
|
||||
रहे
|
||||
ऱ्वासा
|
||||
लिए
|
||||
लिये
|
||||
लेकिन
|
||||
व
|
||||
वर्ग
|
||||
वह
|
||||
वह
|
||||
वहाँ
|
||||
वहीं
|
||||
वाले
|
||||
वुह
|
||||
वे
|
||||
वग़ैरह
|
||||
संग
|
||||
सकता
|
||||
सकते
|
||||
सबसे
|
||||
सभी
|
||||
साथ
|
||||
साबुत
|
||||
साभ
|
||||
सारा
|
||||
से
|
||||
सो
|
||||
ही
|
||||
हुआ
|
||||
हुई
|
||||
हुए
|
||||
है
|
||||
हैं
|
||||
हो
|
||||
होता
|
||||
होती
|
||||
होते
|
||||
होना
|
||||
होने
|
||||
# additional normalized forms of the above
|
||||
अपनि
|
||||
जेसे
|
||||
होति
|
||||
सभि
|
||||
तिंहों
|
||||
इंहों
|
||||
दवारा
|
||||
इसि
|
||||
किंहें
|
||||
थि
|
||||
उंहों
|
||||
ओर
|
||||
जिंहें
|
||||
वहिं
|
||||
अभि
|
||||
बनि
|
||||
हि
|
||||
उंहिं
|
||||
उंहें
|
||||
हें
|
||||
वगेरह
|
||||
एसे
|
||||
रवासा
|
||||
कोन
|
||||
निचे
|
||||
काफि
|
||||
उसि
|
||||
पुरा
|
||||
भितर
|
||||
हे
|
||||
बहि
|
||||
वहां
|
||||
कोइ
|
||||
यहां
|
||||
जिंहों
|
||||
तिंहें
|
||||
किसि
|
||||
कइ
|
||||
यहि
|
||||
इंहिं
|
||||
जिधर
|
||||
इंहें
|
||||
अदि
|
||||
इतयादि
|
||||
हुइ
|
||||
कोनसा
|
||||
इसकि
|
||||
दुसरे
|
||||
जहां
|
||||
अप
|
||||
किंहों
|
||||
उनकि
|
||||
भि
|
||||
वरग
|
||||
हुअ
|
||||
जेसा
|
||||
नहिं
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Tests the HindiAnalyzer
|
||||
*/
|
||||
public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
* stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||
}
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
|
||||
// two ways to write 'hindi' itself.
|
||||
checkOneTermReuse(a, "हिन्दी", "हिंद");
|
||||
checkOneTermReuse(a, "हिंदी", "हिंद");
|
||||
}
|
||||
|
||||
public void testExclusionSet() throws Exception {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("हिंदी");
|
||||
Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT,
|
||||
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test HindiNormalizer
|
||||
*/
|
||||
public class TestHindiNormalizer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test some basic normalization, with an example from the paper.
|
||||
*/
|
||||
public void testBasics() throws IOException {
|
||||
check("अँगरेज़ी", "अंगरेजि");
|
||||
check("अँगरेजी", "अंगरेजि");
|
||||
check("अँग्रेज़ी", "अंगरेजि");
|
||||
check("अँग्रेजी", "अंगरेजि");
|
||||
check("अंगरेज़ी", "अंगरेजि");
|
||||
check("अंगरेजी", "अंगरेजि");
|
||||
check("अंग्रेज़ी", "अंगरेजि");
|
||||
check("अंग्रेजी", "अंगरेजि");
|
||||
}
|
||||
|
||||
public void testDecompositions() throws IOException {
|
||||
// removing nukta dot
|
||||
check("क़िताब", "किताब");
|
||||
check("फ़र्ज़", "फरज");
|
||||
check("क़र्ज़", "करज");
|
||||
// some other composed nukta forms
|
||||
check("ऱऴख़ग़ड़ढ़य़", "रळखगडढय");
|
||||
// removal of format (ZWJ/ZWNJ)
|
||||
check("शार्मा", "शारमा");
|
||||
check("शार्मा", "शारमा");
|
||||
// removal of chandra
|
||||
check("ॅॆॉॊऍऎऑऒ\u0972", "ेेोोएएओओअ");
|
||||
// vowel shortening
|
||||
check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
|
||||
}
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new HindiNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test HindiStemmer
|
||||
*/
|
||||
public class TestHindiStemmer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test masc noun inflections
|
||||
*/
|
||||
public void testMasculineNouns() throws IOException {
|
||||
check("लडका", "लडक");
|
||||
check("लडके", "लडक");
|
||||
check("लडकों", "लडक");
|
||||
|
||||
check("गुरु", "गुर");
|
||||
check("गुरुओं", "गुर");
|
||||
|
||||
check("दोस्त", "दोस्त");
|
||||
check("दोस्तों", "दोस्त");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test feminine noun inflections
|
||||
*/
|
||||
public void testFeminineNouns() throws IOException {
|
||||
check("लडकी", "लडक");
|
||||
check("लडकियों", "लडक");
|
||||
|
||||
check("किताब", "किताब");
|
||||
check("किताबें", "किताब");
|
||||
check("किताबों", "किताब");
|
||||
|
||||
check("आध्यापीका", "आध्यापीक");
|
||||
check("आध्यापीकाएं", "आध्यापीक");
|
||||
check("आध्यापीकाओं", "आध्यापीक");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some verb forms
|
||||
*/
|
||||
public void testVerbs() throws IOException {
|
||||
check("खाना", "खा");
|
||||
check("खाता", "खा");
|
||||
check("खाती", "खा");
|
||||
check("खा", "खा");
|
||||
}
|
||||
|
||||
/**
|
||||
* From the paper: since the suffix list for verbs includes AI, awA and anI,
|
||||
* additional suffixes had to be added to the list for noun/adjectives
|
||||
* ending with these endings.
|
||||
*/
|
||||
public void testExceptions() throws IOException {
|
||||
check("कठिनाइयां", "कठिन");
|
||||
check("कठिन", "कठिन");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new HindiStemFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test IndicNormalizer
|
||||
*/
|
||||
public class TestIndicNormalizer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test some basic normalization
|
||||
*/
|
||||
public void testBasics() throws IOException {
|
||||
check("अाॅअाॅ", "ऑऑ");
|
||||
check("अाॆअाॆ", "ऒऒ");
|
||||
check("अाेअाे", "ओओ");
|
||||
check("अाैअाै", "औऔ");
|
||||
check("अाअा", "आआ");
|
||||
check("अाैर", "और");
|
||||
// khanda-ta
|
||||
check("ত্", "ৎ");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader(input));
|
||||
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Test IndicTokenizer
|
||||
*/
|
||||
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
|
||||
/** Test tokenizing Indic vowels, signs, and punctuation */
|
||||
public void testBasics() throws IOException {
|
||||
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
||||
}
|
||||
|
||||
/** Test that words with format chars such as ZWJ are kept */
|
||||
public void testFormat() throws Exception {
|
||||
TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
|
||||
new StringReader("शार्मा शार्मा"));
|
||||
assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue