LUCENE-5379: Kurdish Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1555359 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-01-04 16:05:50 +00:00
parent 9d0b60388d
commit 2140f4368a
20 changed files with 1239 additions and 1 deletion

@@ -81,6 +81,8 @@ New Features
matter in practice if the number of ranges is over 10 or so. (Mike
McCandless)
* LUCENE-5379: Add Analyzer for Kurdish. (Robert Muir)
Build
* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable

@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Sorani Kurdish.
*/
public final class SoraniAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
/** File containing default Kurdish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
*/
public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
}
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public SoraniAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates a
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @return A
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link SoraniNormalizationFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link SetKeywordMarkerFilter} (if a stem exclusion set is
* provided), and {@link SoraniStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new SoraniNormalizationFilter(result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SoraniStemFilter(result);
return new TokenStreamComponents(source, result);
}
}
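
For reference, a minimal usage sketch for the new analyzer (not part of this commit). It assumes the Lucene 4.x API targeted by this patch; the wrapper class name, the "body" field name, and the use of Version.LUCENE_CURRENT are illustrative. The sample phrase is the one from TestSoraniAnalyzer below, which the chain above reduces to the single stemmed term پیاو.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SoraniAnalyzerUsage {
  public static void main(String[] args) throws IOException {
    // build the analyzer with the bundled default stopword set
    Analyzer analyzer = new SoraniAnalyzer(Version.LUCENE_CURRENT);
    // analyze a short phrase; "body" is an arbitrary field name
    TokenStream ts = analyzer.tokenStream("body", "ئەم پیاوە");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                               // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());    // expected single output: پیاو (stopword removed, suffix stemmed)
    }
    ts.end();
    ts.close();
  }
}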

@@ -0,0 +1,47 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
* orthography.
*/
public final class SoraniNormalizationFilter extends TokenFilter {
private final SoraniNormalizer normalizer = new SoraniNormalizer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public SoraniNormalizationFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
return true;
}
return false;
}
}

@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link SoraniNormalizationFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
/** Creates a new SoraniNormalizationFilterFactory */
public SoraniNormalizationFilterFactory(Map<String,String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public SoraniNormalizationFilter create(TokenStream input) {
return new SoraniNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}

@@ -0,0 +1,127 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
/**
* Normalizes the Unicode representation of Sorani text.
* <p>
* Normalization consists of:
* <ul>
* <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
* <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
* <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
* <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
* <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
* <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
* </ul>
*/
public class SoraniNormalizer {
static final char YEH = '\u064A';
static final char DOTLESS_YEH = '\u0649';
static final char FARSI_YEH = '\u06CC';
static final char KAF = '\u0643';
static final char KEHEH = '\u06A9';
static final char HEH = '\u0647';
static final char AE = '\u06D5';
static final char ZWNJ = '\u200C';
static final char HEH_DOACHASHMEE = '\u06BE';
static final char TEH_MARBUTA = '\u0629';
static final char REH = '\u0631';
static final char RREH = '\u0695';
static final char RREH_ABOVE = '\u0692';
static final char TATWEEL = '\u0640';
static final char FATHATAN = '\u064B';
static final char DAMMATAN = '\u064C';
static final char KASRATAN = '\u064D';
static final char FATHA = '\u064E';
static final char DAMMA = '\u064F';
static final char KASRA = '\u0650';
static final char SHADDA = '\u0651';
static final char SUKUN = '\u0652';
/**
* Normalize an input buffer of Sorani text
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
switch (s[i]) {
case YEH:
case DOTLESS_YEH:
s[i] = FARSI_YEH;
break;
case KAF:
s[i] = KEHEH;
break;
case ZWNJ:
if (i > 0 && s[i-1] == HEH) {
s[i-1] = AE;
}
len = delete(s, i, len);
i--;
break;
case HEH:
if (i == len-1) {
s[i] = AE;
}
break;
case TEH_MARBUTA:
s[i] = AE;
break;
case HEH_DOACHASHMEE:
s[i] = HEH;
break;
case REH:
if (i == 0) {
s[i] = RREH;
}
break;
case RREH_ABOVE:
s[i] = RREH;
break;
case TATWEEL:
case KASRATAN:
case DAMMATAN:
case FATHATAN:
case FATHA:
case DAMMA:
case KASRA:
case SHADDA:
case SUKUN:
len = delete(s, i, len);
i--;
break;
default:
if (Character.getType(s[i]) == Character.FORMAT) {
len = delete(s, i, len);
i--;
}
}
}
return len;
}
}
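
For reference, a quick sketch of calling the normalizer directly (not part of the patch): it rewrites the char buffer in place and returns the new length. The two-character input is chosen only to exercise the KAF and word-final HEH rules listed above; the wrapper class is illustrative.

import org.apache.lucene.analysis.ckb.SoraniNormalizer;

public class SoraniNormalizerUsage {
  public static void main(String[] args) {
    SoraniNormalizer normalizer = new SoraniNormalizer();
    // KAF (0643) followed by word-final HEH (0647)
    char[] buffer = "\u0643\u0647".toCharArray();
    int newLen = normalizer.normalize(buffer, buffer.length);
    // per the rules above: KAF -> KEHEH (06A9), word-final HEH -> AE (06D5)
    System.out.println(new String(buffer, 0, newLen));
  }
}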

@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see SetKeywordMarkerFilter */
public final class SoraniStemFilter extends TokenFilter {
private final SoraniStemmer stemmer = new SoraniStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public SoraniStemFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if(!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}
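
For reference, a sketch of the keyword-marking setup described in the javadoc above (not part of the patch): terms placed in the protected set are flagged via KeywordAttribute by SetKeywordMarkerFilter and skipped by SoraniStemFilter, while everything else is stemmed. The protected term is the one used by TestSoraniAnalyzer's stem-exclusion test; the class and constant names are illustrative.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ckb.SoraniStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ProtectedStemmingExample {
  // terms in this set are marked as keywords and left unstemmed
  static final CharArraySet PROTECTED = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
  static {
    PROTECTED.add("پیاوە");
  }

  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
      TokenStream result = new SetKeywordMarkerFilter(source, PROTECTED);
      result = new SoraniStemFilter(result);   // keyword-marked tokens pass through unchanged
      return new TokenStreamComponents(source, result);
    }
  };
}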

@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link SoraniStemFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
* &lt;filter class="solr.SoraniStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SoraniStemFilterFactory extends TokenFilterFactory {
/** Creates a new SoraniStemFilterFactory */
public SoraniStemFilterFactory(Map<String,String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public SoraniStemFilter create(TokenStream input) {
return new SoraniStemFilter(input);
}
}

@@ -0,0 +1,103 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
/**
* Light stemmer for Sorani
*/
public class SoraniStemmer {
/**
* Stem an input buffer of Sorani text.
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after stemming
*/
public int stem(char s[], int len) {
// postposition
if (len > 5 && endsWith(s, len, "دا")) {
len -= 2;
} else if (len > 4 && endsWith(s, len, "نا")) {
len--;
} else if (len > 6 && endsWith(s, len, "ەوە")) {
len -= 3;
}
// possessive pronoun
if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
len -= 3;
}
// indefinite singular ezafe
if (len > 6 && endsWith(s, len, "ێکی")) {
return len-3;
} else if (len > 7 && endsWith(s, len, "یەکی")) {
return len-4;
}
// indefinite singular
if (len > 5 && endsWith(s, len, "ێک")) {
return len-2;
} else if (len > 6 && endsWith(s, len, "یەک")) {
return len-3;
}
// definite singular
else if (len > 6 && endsWith(s, len, "ەکە")) {
return len-3;
} else if (len > 5 && endsWith(s, len, "کە")) {
return len-2;
}
// definite plural
else if (len > 7 && endsWith(s, len, "ەکان")) {
return len-4;
} else if (len > 6 && endsWith(s, len, "کان")) {
return len-3;
}
// indefinite plural ezafe
else if (len > 7 && endsWith(s, len, "یانی")) {
return len-4;
} else if (len > 6 && endsWith(s, len, "انی")) {
return len-3;
}
// indefinite plural
else if (len > 6 && endsWith(s, len, "یان")) {
return len-3;
} else if (len > 5 && endsWith(s, len, "ان")) {
return len-2;
}
// demonstrative plural
else if (len > 7 && endsWith(s, len, "یانە")) {
return len-4;
} else if (len > 6 && endsWith(s, len, "انە")) {
return len-3;
}
// demonstrative singular
else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
return len-2;
} else if (len > 4 && endsWith(s, len, "ە")) {
return len-1;
}
// absolute singular ezafe
else if (len > 4 && endsWith(s, len, "ی")) {
return len-1;
}
return len;
}
}
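
For orientation, a minimal direct call to the stemmer (not part of the patch; inside the analyzer it runs only after normalization, lowercasing, and stopword removal). The input/output pair is the definite-plural case exercised by TestSoraniStemFilter further down; the wrapper class is illustrative.

import org.apache.lucene.analysis.ckb.SoraniStemmer;

public class SoraniStemmerUsage {
  public static void main(String[] args) {
    SoraniStemmer stemmer = new SoraniStemmer();
    // definite plural suffix -akAn, as in TestSoraniStemFilter
    char[] buffer = "پیاوەکان".toCharArray();
    int newLen = stemmer.stem(buffer, buffer.length);   // stems in place, returns the new length
    System.out.println(new String(buffer, 0, newLen));  // پیاو
  }
}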

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Sorani Kurdish.
</body>
</html>

@@ -19,6 +19,8 @@ org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
org.apache.lucene.analysis.ckb.SoraniNormalizationFilterFactory
org.apache.lucene.analysis.ckb.SoraniStemFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory

@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک

@@ -0,0 +1,66 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Sorani analyzer
*/
public class TestSoraniAnalyzer extends BaseTokenStreamTestCase {
/**
* This test fails with NPE when the stopwords file is missing in classpath
*/
public void testResourcesAvailable() {
new SoraniAnalyzer(TEST_VERSION_CURRENT);
}
public void testStopwords() throws IOException {
Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "ئەم پیاوە", new String[] {"پیاو"});
}
public void testCustomStopwords() throws IOException {
Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesTo(a, "ئەم پیاوە",
new String[] {"ئەم", "پیاو"});
}
public void testReusableTokenStream() throws IOException {
Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "پیاوە", new String[] {"پیاو"});
assertAnalyzesTo(a, "پیاو", new String[] {"پیاو"});
}
public void testWithStemExclusionSet() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("پیاوە");
Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new SoraniAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}

@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests normalization for Sorani (this is more critical than stemming...)
*/
public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
}
};
public void testY() throws Exception {
checkOneTerm(a, "\u064A", "\u06CC");
checkOneTerm(a, "\u0649", "\u06CC");
checkOneTerm(a, "\u06CC", "\u06CC");
}
public void testK() throws Exception {
checkOneTerm(a, "\u0643", "\u06A9");
checkOneTerm(a, "\u06A9", "\u06A9");
}
public void testH() throws Exception {
// initial
checkOneTerm(a, "\u0647\u200C", "\u06D5");
// medial
checkOneTerm(a, "\u0647\u200C\u06A9", "\u06D5\u06A9");
checkOneTerm(a, "\u06BE", "\u0647");
checkOneTerm(a, "\u0629", "\u06D5");
}
public void testFinalH() throws Exception {
// always (and in final form by def), so frequently omitted
checkOneTerm(a, "\u0647\u0647\u0647", "\u0647\u0647\u06D5");
}
public void testRR() throws Exception {
checkOneTerm(a, "\u0692", "\u0695");
}
public void testInitialRR() throws Exception {
// always, so frequently omitted
checkOneTerm(a, "\u0631\u0631\u0631", "\u0695\u0631\u0631");
}
public void testRemove() throws Exception {
checkOneTerm(a, "\u0640", "");
checkOneTerm(a, "\u064B", "");
checkOneTerm(a, "\u064C", "");
checkOneTerm(a, "\u064D", "");
checkOneTerm(a, "\u064E", "");
checkOneTerm(a, "\u064F", "");
checkOneTerm(a, "\u0650", "");
checkOneTerm(a, "\u0651", "");
checkOneTerm(a, "\u0652", "");
// we peek backwards in this case to look for h+200C, ensure this works
checkOneTerm(a, "\u200C", "");
}
public void testEmptyTerm() throws IOException {
checkOneTerm(a, "", "");
}
}

@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Sorani normalization factory is working.
*/
public class TestSoraniNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testNormalization() throws Exception {
Reader reader = new StringReader("پیــــاوەکان");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("SoraniNormalization").create(stream);
assertTokenStreamContents(stream, new String[] { "پیاوەکان" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("SoraniNormalization", "bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
}

@@ -0,0 +1,100 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Sorani Stemmer.
*/
public class TestSoraniStemFilter extends BaseTokenStreamTestCase {
SoraniAnalyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
public void testIndefiniteSingular() throws Exception {
checkOneTerm(a, "پیاوێک", "پیاو"); // -ek
checkOneTerm(a, "دەرگایەک", "دەرگا"); // -yek
}
public void testDefiniteSingular() throws Exception {
checkOneTerm(a, "پیاوەكە", "پیاو"); // -aka
checkOneTerm(a, "دەرگاكە", "دەرگا"); // -ka
}
public void testDemonstrativeSingular() throws Exception {
checkOneTerm(a, "کتاویە", "کتاوی"); // -a
checkOneTerm(a, "دەرگایە", "دەرگا"); // -ya
}
public void testIndefinitePlural() throws Exception {
checkOneTerm(a, "پیاوان", "پیاو"); // -An
checkOneTerm(a, "دەرگایان", "دەرگا"); // -yAn
}
public void testDefinitePlural() throws Exception {
checkOneTerm(a, "پیاوەکان", "پیاو"); // -akAn
checkOneTerm(a, "دەرگاکان", "دەرگا"); // -kAn
}
public void testDemonstrativePlural() throws Exception {
checkOneTerm(a, "پیاوانە", "پیاو"); // -Ana
checkOneTerm(a, "دەرگایانە", "دەرگا"); // -yAna
}
public void testEzafe() throws Exception {
checkOneTerm(a, "هۆتیلی", "هۆتیل"); // singular
checkOneTerm(a, "هۆتیلێکی", "هۆتیل"); // indefinite
checkOneTerm(a, "هۆتیلانی", "هۆتیل"); // plural
}
public void testPostpositions() throws Exception {
checkOneTerm(a, "دوورەوە", "دوور"); // -awa
checkOneTerm(a, "نیوەشەودا", "نیوەشەو"); // -dA
checkOneTerm(a, "سۆرانا", "سۆران"); // -A
}
public void testPossessives() throws Exception {
checkOneTerm(a, "پارەمان", "پارە"); // -mAn
checkOneTerm(a, "پارەتان", "پارە"); // -tAn
checkOneTerm(a, "پارەیان", "پارە"); // -yAn
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new SoraniStemFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
}
/** test against a basic vocabulary file */
public void testVocabulary() throws Exception {
// top 8k words or so: freq > 1000
assertVocabulary(a, getDataFile("ckbtestdata.zip"), "testdata.txt");
}
}

@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Sorani stem factory is working.
*/
public class TestSoraniStemFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testStemming() throws Exception {
Reader reader = new StringReader("پیاوەکان");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("SoraniStem").create(stream);
assertTokenStreamContents(stream, new String[] { "پیاو" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("SoraniStem", "bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
}

@@ -612,7 +612,7 @@
<property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/>
<property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
<property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
<property name="analysis.conf.dest" value="${example}/solr/collection1/conf/lang"/>
<target name="sync-analyzers"
description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">
@@ -625,6 +625,9 @@
<!-- catalan -->
<copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
<!-- kurdish -->
<copy verbose="true" file="${analysis-common.res.dir}/ckb/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_ckb.txt"/>
<!-- czech -->
<copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_cz.txt"/>

@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک

@@ -779,6 +779,18 @@
</analyzer>
</fieldType>
<!-- Kurdish -->
<fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SoraniNormalizationFilterFactory"/>
<!-- for any latin text -->
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ckb.txt"/>
<filter class="solr.SoraniStemFilterFactory"/>
</analyzer>
</fieldType>
<!-- Czech -->
<fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
<analyzer>