mirror of https://github.com/apache/lucene.git
LUCENE-5379: Kurdish Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1555359 13f79535-47bb-0310-9956-ffa450edef68
parent 9d0b60388d
commit 2140f4368a
@@ -81,6 +81,8 @@ New Features
  matter in practice if the number of ranges is over 10 or so. (Mike
  McCandless)

+* LUCENE-5379: Add Analyzer for Kurdish. (Robert Muir)

Build

* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable
@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Sorani Kurdish.
 */
public final class SoraniAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Kurdish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public SoraniAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SoraniStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SoraniStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}
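Editor's note: the analyzer above wires StandardTokenizer, StandardFilter, SoraniNormalizationFilter, LowerCaseFilter, StopFilter and SoraniStemFilter into a single chain. The following is a minimal usage sketch, not part of this commit; it assumes the Lucene 4.x TokenStream consumer API used throughout this patch and reuses Version.LUCENE_CURRENT, the same constant the stopword loader above relies on. The expected output mirrors TestSoraniAnalyzer#testStopwords further down in this commit.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SoraniAnalyzerUsage {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new SoraniAnalyzer(Version.LUCENE_CURRENT);
    // "ئەم پیاوە": the stopword "ئەم" is removed and "پیاوە" is stemmed to "پیاو",
    // as asserted by TestSoraniAnalyzer in this commit.
    TokenStream ts = analyzer.tokenStream("body", new StringReader("ئەم پیاوە"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints: پیاو
    }
    ts.end();
    ts.close();
  }
}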
@@ -0,0 +1,47 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
 * orthography.
 */
public final class SoraniNormalizationFilter extends TokenFilter {
  private final SoraniNormalizer normalizer = new SoraniNormalizer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public SoraniNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
      termAtt.setLength(newlen);
      return true;
    }
    return false;
  }
}
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniNormalizationFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.SoraniNormalizationFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /** Creates a new SoraniNormalizationFilterFactory */
  public SoraniNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniNormalizationFilter create(TokenStream input) {
    return new SoraniNormalizationFilter(input);
  }

  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
}
@@ -0,0 +1,127 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.delete;

/**
 * Normalizes the Unicode representation of Sorani text.
 * <p>
 * Normalization consists of:
 * <ul>
 *   <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
 *   <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
 *   <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
 *   <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
 *   <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
 *   <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
 * </ul>
 */
public class SoraniNormalizer {

  static final char YEH = '\u064A';
  static final char DOTLESS_YEH = '\u0649';
  static final char FARSI_YEH = '\u06CC';

  static final char KAF = '\u0643';
  static final char KEHEH = '\u06A9';

  static final char HEH = '\u0647';
  static final char AE = '\u06D5';
  static final char ZWNJ = '\u200C';
  static final char HEH_DOACHASHMEE = '\u06BE';
  static final char TEH_MARBUTA = '\u0629';

  static final char REH = '\u0631';
  static final char RREH = '\u0695';
  static final char RREH_ABOVE = '\u0692';

  static final char TATWEEL = '\u0640';
  static final char FATHATAN = '\u064B';
  static final char DAMMATAN = '\u064C';
  static final char KASRATAN = '\u064D';
  static final char FATHA = '\u064E';
  static final char DAMMA = '\u064F';
  static final char KASRA = '\u0650';
  static final char SHADDA = '\u0651';
  static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Sorani text
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case YEH:
        case DOTLESS_YEH:
          s[i] = FARSI_YEH;
          break;
        case KAF:
          s[i] = KEHEH;
          break;
        case ZWNJ:
          if (i > 0 && s[i-1] == HEH) {
            s[i-1] = AE;
          }
          len = delete(s, i, len);
          i--;
          break;
        case HEH:
          if (i == len-1) {
            s[i] = AE;
          }
          break;
        case TEH_MARBUTA:
          s[i] = AE;
          break;
        case HEH_DOACHASHMEE:
          s[i] = HEH;
          break;
        case REH:
          if (i == 0) {
            s[i] = RREH;
          }
          break;
        case RREH_ABOVE:
          s[i] = RREH;
          break;
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          len = delete(s, i, len);
          i--;
          break;
        default:
          if (Character.getType(s[i]) == Character.FORMAT) {
            len = delete(s, i, len);
            i--;
          }
      }
    }
    return len;
  }
}
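Editor's note: the normalization rules listed in the class javadoc can also be exercised directly on a char buffer, which is exactly what SoraniNormalizationFilter does per token. A small sketch, not part of this commit; the sample word and the expected result are assumptions derived from the mappings above (the tatweel characters U+0640 are deleted in place).

import org.apache.lucene.analysis.ckb.SoraniNormalizer;

public class SoraniNormalizerUsage {
  public static void main(String[] args) {
    SoraniNormalizer normalizer = new SoraniNormalizer();
    // "پیــاوە" contains two tatweel (U+0640) padding characters.
    char[] buffer = "پیــاوە".toCharArray();
    // normalize() rewrites the buffer in place and returns the new length.
    int len = normalizer.normalize(buffer, buffer.length);
    System.out.println(new String(buffer, 0, len)); // prints: پیاوە
  }
}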
@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see SetKeywordMarkerFilter */
public final class SoraniStemFilter extends TokenFilter {
  private final SoraniStemmer stemmer = new SoraniStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public SoraniStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if(!keywordAttr.isKeyword()) {
        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
}
@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniStemFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.SoraniNormalizationFilterFactory"/>
 *     <filter class="solr.SoraniStemFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class SoraniStemFilterFactory extends TokenFilterFactory {

  /** Creates a new SoraniStemFilterFactory */
  public SoraniStemFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniStemFilter create(TokenStream input) {
    return new SoraniStemFilter(input);
  }
}
@@ -0,0 +1,103 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;

/**
 * Light stemmer for Sorani
 */
public class SoraniStemmer {

  /**
   * Stem an input buffer of Sorani text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(char s[], int len) {
    // postposition
    if (len > 5 && endsWith(s, len, "دا")) {
      len -= 2;
    } else if (len > 4 && endsWith(s, len, "نا")) {
      len--;
    } else if (len > 6 && endsWith(s, len, "ەوە")) {
      len -= 3;
    }

    // possessive pronoun
    if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
      len -= 3;
    }

    // indefinite singular ezafe
    if (len > 6 && endsWith(s, len, "ێکی")) {
      return len-3;
    } else if (len > 7 && endsWith(s, len, "یەکی")) {
      return len-4;
    }
    // indefinite singular
    if (len > 5 && endsWith(s, len, "ێک")) {
      return len-2;
    } else if (len > 6 && endsWith(s, len, "یەک")) {
      return len-3;
    }
    // definite singular
    else if (len > 6 && endsWith(s, len, "ەکە")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "کە")) {
      return len-2;
    }
    // definite plural
    else if (len > 7 && endsWith(s, len, "ەکان")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "کان")) {
      return len-3;
    }
    // indefinite plural ezafe
    else if (len > 7 && endsWith(s, len, "یانی")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انی")) {
      return len-3;
    }
    // indefinite plural
    else if (len > 6 && endsWith(s, len, "یان")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "ان")) {
      return len-2;
    }
    // demonstrative plural
    else if (len > 7 && endsWith(s, len, "یانە")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انە")) {
      return len-3;
    }
    // demonstrative singular
    else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
      return len-2;
    } else if (len > 4 && endsWith(s, len, "ە")) {
      return len-1;
    }
    // absolute singular ezafe
    else if (len > 4 && endsWith(s, len, "ی")) {
      return len-1;
    }
    return len;
  }
}
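Editor's note: the light stemmer can likewise be called directly on a term buffer; SoraniStemFilter does this for every token that is not marked as a keyword. A minimal sketch, not part of this commit; the expected stem matches the پیاوەکان to پیاو case exercised by the tests later in this patch.

import org.apache.lucene.analysis.ckb.SoraniStemmer;

public class SoraniStemmerUsage {
  public static void main(String[] args) {
    SoraniStemmer stemmer = new SoraniStemmer();
    // "پیاوەکان" = stem "پیاو" plus the definite plural suffix "ەکان"
    char[] buffer = "پیاوەکان".toCharArray();
    int len = stemmer.stem(buffer, buffer.length);  // strips the suffix in place
    System.out.println(new String(buffer, 0, len)); // prints: پیاو
  }
}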
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Sorani Kurdish.
</body>
</html>
@@ -19,6 +19,8 @@ org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
+org.apache.lucene.analysis.ckb.SoraniNormalizationFilterFactory
+org.apache.lucene.analysis.ckb.SoraniStemFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
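Editor's note: registering the two factories in this SPI file is what lets Solr (and any other caller) resolve them by short name. A hedged sketch, not part of this commit: it assumes the TokenFilterFactory.forName lookup of this Lucene 4.x codebase and an empty argument map, which these particular factories accept since they require no parameters.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SoraniFactoryLookup {
  public static void main(String[] args) {
    Map<String,String> noArgs = new HashMap<String,String>();
    // Resolved through META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    TokenFilterFactory stem = TokenFilterFactory.forName("SoraniStem", noArgs);
    TokenFilterFactory norm = TokenFilterFactory.forName("SoraniNormalization", noArgs);
    System.out.println(stem.getClass().getName()); // ...ckb.SoraniStemFilterFactory
    System.out.println(norm.getClass().getName()); // ...ckb.SoraniNormalizationFilterFactory
  }
}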
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
@@ -0,0 +1,66 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Test the Sorani analyzer
 */
public class TestSoraniAnalyzer extends BaseTokenStreamTestCase {

  /**
   * This test fails with NPE when the stopwords file is missing in classpath
   */
  public void testResourcesAvailable() {
    new SoraniAnalyzer(TEST_VERSION_CURRENT);
  }

  public void testStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "ئەم پیاوە", new String[] {"پیاو"});
  }

  public void testCustomStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
    assertAnalyzesTo(a, "ئەم پیاوە",
        new String[] {"ئەم", "پیاو"});
  }

  public void testReusableTokenStream() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "پیاوە", new String[] {"پیاو"});
    assertAnalyzesTo(a, "پیاو", new String[] {"پیاو"});
  }

  public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("پیاوە");
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new SoraniAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Tests normalization for Sorani (this is more critical than stemming...)
 */
public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
    }
  };

  public void testY() throws Exception {
    checkOneTerm(a, "\u064A", "\u06CC");
    checkOneTerm(a, "\u0649", "\u06CC");
    checkOneTerm(a, "\u06CC", "\u06CC");
  }

  public void testK() throws Exception {
    checkOneTerm(a, "\u0643", "\u06A9");
    checkOneTerm(a, "\u06A9", "\u06A9");
  }

  public void testH() throws Exception {
    // initial
    checkOneTerm(a, "\u0647\u200C", "\u06D5");
    // medial
    checkOneTerm(a, "\u0647\u200C\u06A9", "\u06D5\u06A9");

    checkOneTerm(a, "\u06BE", "\u0647");
    checkOneTerm(a, "\u0629", "\u06D5");
  }

  public void testFinalH() throws Exception {
    // always (and in final form by def), so frequently omitted
    checkOneTerm(a, "\u0647\u0647\u0647", "\u0647\u0647\u06D5");
  }

  public void testRR() throws Exception {
    checkOneTerm(a, "\u0692", "\u0695");
  }

  public void testInitialRR() throws Exception {
    // always, so frequently omitted
    checkOneTerm(a, "\u0631\u0631\u0631", "\u0695\u0631\u0631");
  }

  public void testRemove() throws Exception {
    checkOneTerm(a, "\u0640", "");
    checkOneTerm(a, "\u064B", "");
    checkOneTerm(a, "\u064C", "");
    checkOneTerm(a, "\u064D", "");
    checkOneTerm(a, "\u064E", "");
    checkOneTerm(a, "\u064F", "");
    checkOneTerm(a, "\u0650", "");
    checkOneTerm(a, "\u0651", "");
    checkOneTerm(a, "\u0652", "");
    // we peek backwards in this case to look for h+200C, ensure this works
    checkOneTerm(a, "\u200C", "");
  }

  public void testEmptyTerm() throws IOException {
    checkOneTerm(a, "", "");
  }
}
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani normalization factory is working.
 */
public class TestSoraniNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testNormalization() throws Exception {
    Reader reader = new StringReader("پیــــاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاوەکان" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniNormalization", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
@@ -0,0 +1,100 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Test the Sorani Stemmer.
 */
public class TestSoraniStemFilter extends BaseTokenStreamTestCase {
  SoraniAnalyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);

  public void testIndefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوێک", "پیاو"); // -ek
    checkOneTerm(a, "دەرگایەک", "دەرگا"); // -yek
  }

  public void testDefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوەكە", "پیاو"); // -aka
    checkOneTerm(a, "دەرگاكە", "دەرگا"); // -ka
  }

  public void testDemonstrativeSingular() throws Exception {
    checkOneTerm(a, "کتاویە", "کتاوی"); // -a
    checkOneTerm(a, "دەرگایە", "دەرگا"); // -ya
  }

  public void testIndefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوان", "پیاو"); // -An
    checkOneTerm(a, "دەرگایان", "دەرگا"); // -yAn
  }

  public void testDefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوەکان", "پیاو"); // -akAn
    checkOneTerm(a, "دەرگاکان", "دەرگا"); // -kAn
  }

  public void testDemonstrativePlural() throws Exception {
    checkOneTerm(a, "پیاوانە", "پیاو"); // -Ana
    checkOneTerm(a, "دەرگایانە", "دەرگا"); // -yAna
  }

  public void testEzafe() throws Exception {
    checkOneTerm(a, "هۆتیلی", "هۆتیل"); // singular
    checkOneTerm(a, "هۆتیلێکی", "هۆتیل"); // indefinite
    checkOneTerm(a, "هۆتیلانی", "هۆتیل"); // plural
  }

  public void testPostpositions() throws Exception {
    checkOneTerm(a, "دوورەوە", "دوور"); // -awa
    checkOneTerm(a, "نیوەشەودا", "نیوەشەو"); // -dA
    checkOneTerm(a, "سۆرانا", "سۆران"); // -A
  }

  public void testPossessives() throws Exception {
    checkOneTerm(a, "پارەمان", "پارە"); // -mAn
    checkOneTerm(a, "پارەتان", "پارە"); // -tAn
    checkOneTerm(a, "پارەیان", "پارە"); // -yAn
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SoraniStemFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
  }

  /** test against a basic vocabulary file */
  public void testVocabulary() throws Exception {
    // top 8k words or so: freq > 1000
    assertVocabulary(a, getDataFile("ckbtestdata.zip"), "testdata.txt");
  }
}
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani stem factory is working.
 */
public class TestSoraniStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testStemming() throws Exception {
    Reader reader = new StringReader("پیاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniStem").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاو" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniStem", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
Binary file not shown.
@@ -612,7 +612,7 @@
  <property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/>
  <property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
- <property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
+ <property name="analysis.conf.dest" value="${example}/solr/collection1/conf/lang"/>

  <target name="sync-analyzers"
          description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">

@@ -625,6 +625,9 @@
    <!-- catalan -->
    <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
          tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
+   <!-- kurdish -->
+   <copy verbose="true" file="${analysis-common.res.dir}/ckb/stopwords.txt"
+         tofile="${analysis.conf.dest}/stopwords_ckb.txt"/>
    <!-- czech -->
    <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
          tofile="${analysis.conf.dest}/stopwords_cz.txt"/>
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
@@ -779,6 +779,18 @@
      </analyzer>
    </fieldType>

+    <!-- Kurdish -->
+    <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SoraniNormalizationFilterFactory"/>
+        <!-- for any latin text -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ckb.txt"/>
+        <filter class="solr.SoraniStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
    <!-- Czech -->
    <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
      <analyzer>