LUCENE-5379: Kurdish Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1555359 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-01-04 16:05:50 +00:00
parent 9d0b60388d
commit 2140f4368a
20 changed files with 1239 additions and 1 deletions

View File

@ -81,6 +81,8 @@ New Features
matter in practice if the number of ranges is over 10 or so. (Mike
McCandless)
* LUCENE-5379: Add Analyzer for Kurdish. (Robert Muir)
Build
* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable

View File

@ -0,0 +1,130 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Sorani Kurdish.
*/
public final class SoraniAnalyzer extends StopwordAnalyzerBase {

  /** Terms in this set are marked as keywords and bypass stemming. */
  private final CharArraySet stemExclusionSet;

  /** File containing default Kurdish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time (initialization-on-demand holder idiom).
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR); chain the cause so classpath problems are diagnosable
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   *
   * @param matchVersion lucene compatibility version
   */
  public SoraniAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    // defensive copy: callers must not be able to mutate the exclusion set after construction
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SoraniStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // normalize orthography before lowercasing/stopping so stopwords match reliably
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SoraniStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
* orthography.
*/
public final class SoraniNormalizationFilter extends TokenFilter {
  private final SoraniNormalizer normalizer = new SoraniNormalizer();
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);

  /** Wraps the given stream, normalizing each token's orthography in place. */
  public SoraniNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // normalize the term buffer in place; the normalizer returns the new length
    final int length = normalizer.normalize(termAttribute.buffer(), termAttribute.length());
    termAttribute.setLength(length);
    return true;
  }
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link SoraniNormalizationFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /** Creates a new SoraniNormalizationFilterFactory */
  public SoraniNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    // the superclass consumes recognized args; anything left over is a config error
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniNormalizationFilter create(TokenStream input) {
    return new SoraniNormalizationFilter(input);
  }

  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    // normalization is safe for multi-term (wildcard/fuzzy) analysis, so reuse this factory
    return this;
  }
}

View File

@ -0,0 +1,127 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
/**
* Normalizes the Unicode representation of Sorani text.
* <p>
* Normalization consists of:
* <ul>
* <li>Alternate forms of 'y' (0064, 0649) are converted to 06CC (FARSI YEH)
* <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
* <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
* <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
* <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
* <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
* </ul>
*/
public class SoraniNormalizer {
  // Arabic-script code points involved in Sorani normalization.
  static final char YEH = '\u064A';
  static final char DOTLESS_YEH = '\u0649';
  static final char FARSI_YEH = '\u06CC';
  static final char KAF = '\u0643';
  static final char KEHEH = '\u06A9';
  static final char HEH = '\u0647';
  static final char AE = '\u06D5';
  static final char ZWNJ = '\u200C';
  static final char HEH_DOACHASHMEE = '\u06BE';
  static final char TEH_MARBUTA = '\u0629';
  static final char REH = '\u0631';
  static final char RREH = '\u0695';
  static final char RREH_ABOVE = '\u0692';
  static final char TATWEEL = '\u0640';
  // Harakat (short-vowel diacritics) — all removed by normalize().
  static final char FATHATAN = '\u064B';
  static final char DAMMATAN = '\u064C';
  static final char KASRATAN = '\u064D';
  static final char FATHA = '\u064E';
  static final char DAMMA = '\u064F';
  static final char KASRA = '\u0650';
  static final char SHADDA = '\u0651';
  static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Sorani text in place.
   * <p>
   * Normalization consists of:
   * <ul>
   * <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
   * <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
   * <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
   * <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
   * <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695
   * <li>Harakat, tatweel, and formatting characters (Unicode category Cf) are removed
   * </ul>
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case YEH:
        case DOTLESS_YEH:
          s[i] = FARSI_YEH;
          break;
        case KAF:
          s[i] = KEHEH;
          break;
        case ZWNJ:
          // HEH + ZWNJ is an alternate spelling of the vowel AE; rewrite the
          // preceding HEH, then delete the ZWNJ itself.
          if (i > 0 && s[i-1] == HEH) {
            s[i-1] = AE;
          }
          len = delete(s, i, len);
          i--; // re-examine the character shifted into position i
          break;
        case HEH:
          // word-final HEH is the vowel AE
          if (i == len-1) {
            s[i] = AE;
          }
          break;
        case TEH_MARBUTA:
          s[i] = AE;
          break;
        case HEH_DOACHASHMEE:
          s[i] = HEH;
          break;
        case REH:
          // word-initial REH is always pronounced RREH, so frequently written plain
          if (i == 0) {
            s[i] = RREH;
          }
          break;
        case RREH_ABOVE:
          s[i] = RREH;
          break;
        // tatweel and all harakat carry no lexical information: drop them
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          len = delete(s, i, len);
          i--; // re-examine the character shifted into position i
          break;
        default:
          // strip invisible formatting characters (directional controls, etc.)
          if (Character.getType(s[i]) == Character.FORMAT) {
            len = delete(s, i, len);
            i--;
          }
      }
    }
    return len;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see SetKeywordMarkerFilter */
public final class SoraniStemFilter extends TokenFilter {
  private final SoraniStemmer stemmer = new SoraniStemmer();
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

  /** Wraps the given stream, stemming each non-keyword token in place. */
  public SoraniStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // tokens flagged as keywords (e.g. by SetKeywordMarkerFilter) pass through unstemmed
    if (keywordAttribute.isKeyword() == false) {
      final int length = stemmer.stem(termAttribute.buffer(), termAttribute.length());
      termAttribute.setLength(length);
    }
    return true;
  }
}

View File

@ -0,0 +1,50 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link SoraniStemFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
* &lt;filter class="solr.SoraniStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SoraniStemFilterFactory extends TokenFilterFactory {

  /** Creates a new SoraniStemFilterFactory */
  public SoraniStemFilterFactory(Map<String,String> args) {
    super(args);
    // the superclass consumes recognized args; anything left over is a config error
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniStemFilter create(TokenStream input) {
    return new SoraniStemFilter(input);
  }
}

View File

@ -0,0 +1,103 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
/**
* Light stemmer for Sorani
*/
public class SoraniStemmer {

  /**
   * Stem an input buffer of Sorani text in place by stripping postpositions,
   * possessive pronouns, and (at most one) inflectional suffix.
   * <p>
   * The order of the checks matters: postpositions and possessives are
   * stripped first (they attach outside the inflection), then exactly one
   * inflectional suffix is removed — the first matching branch returns.
   * Each length guard (e.g. {@code len > 5}) keeps a minimal stem so very
   * short words are left untouched.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after stemming
   */
  public int stem(char s[], int len) {
    // postposition
    if (len > 5 && endsWith(s, len, "دا")) {
      len -= 2;
    } else if (len > 4 && endsWith(s, len, "نا")) {
      // NOTE(review): only one char is removed although the matched suffix is
      // two chars — looks intentional (keeps the final noon), but confirm
      len--;
    } else if (len > 6 && endsWith(s, len, "ەوە")) {
      len -= 3;
    }
    // possessive pronoun (1st/3rd/2nd person plural)
    if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
      len -= 3;
    }
    // indefinite singular ezafe
    if (len > 6 && endsWith(s, len, "ێکی")) {
      return len-3;
    } else if (len > 7 && endsWith(s, len, "یەکی")) {
      return len-4;
    }
    // indefinite singular
    if (len > 5 && endsWith(s, len, "ێک")) {
      return len-2;
    } else if (len > 6 && endsWith(s, len, "یەک")) {
      return len-3;
    }
    // definite singular
    else if (len > 6 && endsWith(s, len, "ەکە")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "کە")) {
      return len-2;
    }
    // definite plural
    else if (len > 7 && endsWith(s, len, "ەکان")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "کان")) {
      return len-3;
    }
    // indefinite plural ezafe
    else if (len > 7 && endsWith(s, len, "یانی")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انی")) {
      return len-3;
    }
    // indefinite plural
    else if (len > 6 && endsWith(s, len, "یان")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "ان")) {
      return len-2;
    }
    // demonstrative plural
    else if (len > 7 && endsWith(s, len, "یانە")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انە")) {
      return len-3;
    }
    // demonstrative singular
    else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
      return len-2;
    } else if (len > 4 && endsWith(s, len, "ە")) {
      return len-1;
    }
    // absolute singular ezafe
    else if (len > 4 && endsWith(s, len, "ی")) {
      return len-1;
    }
    return len;
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Sorani Kurdish.
</body>
</html>

View File

@ -19,6 +19,8 @@ org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
org.apache.lucene.analysis.ckb.SoraniNormalizationFilterFactory
org.apache.lucene.analysis.ckb.SoraniStemFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory

View File

@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک

View File

@ -0,0 +1,66 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Test the Sorani analyzer
*/
public class TestSoraniAnalyzer extends BaseTokenStreamTestCase {

  /**
   * This test fails with NPE when the stopwords file is missing in classpath
   */
  public void testResourcesAvailable() {
    new SoraniAnalyzer(TEST_VERSION_CURRENT);
  }

  // default stopword set removes the demonstrative; stemmer strips the suffix
  public void testStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "ئەم پیاوە", new String[] {"پیاو"});
  }

  // with an empty stopword set the demonstrative survives as its own token
  public void testCustomStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
    assertAnalyzesTo(a, "ئەم پیاوە",
        new String[] {"ئەم", "پیاو"});
  }

  // the same analyzer instance must produce consistent results across calls
  public void testReusableTokenStream() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "پیاوە", new String[] {"پیاو"});
    assertAnalyzesTo(a, "پیاو", new String[] {"پیاو"});
  }

  // a term in the stem exclusion set must pass through unstemmed
  public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("پیاوە");
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new SoraniAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
}

View File

@ -0,0 +1,92 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests normalization for Sorani (this is more critical than stemming...)
*/
public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {

  /** Keyword tokenizer + normalization filter, so each input is one whole term. */
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final SoraniNormalizationFilter filter = new SoraniNormalizationFilter(source);
      return new TokenStreamComponents(source, filter);
    }
  };

  // YEH and DOTLESS YEH both fold to FARSI YEH; FARSI YEH is unchanged
  public void testY() throws Exception {
    checkOneTerm(analyzer, "\u064A", "\u06CC");
    checkOneTerm(analyzer, "\u0649", "\u06CC");
    checkOneTerm(analyzer, "\u06CC", "\u06CC");
  }

  // KAF folds to KEHEH; KEHEH is unchanged
  public void testK() throws Exception {
    checkOneTerm(analyzer, "\u0643", "\u06A9");
    checkOneTerm(analyzer, "\u06A9", "\u06A9");
  }

  public void testH() throws Exception {
    // initial
    checkOneTerm(analyzer, "\u0647\u200C", "\u06D5");
    // medial
    checkOneTerm(analyzer, "\u0647\u200C\u06A9", "\u06D5\u06A9");
    checkOneTerm(analyzer, "\u06BE", "\u0647");
    checkOneTerm(analyzer, "\u0629", "\u06D5");
  }

  public void testFinalH() throws Exception {
    // always (and in final form by def), so frequently omitted
    checkOneTerm(analyzer, "\u0647\u0647\u0647", "\u0647\u0647\u06D5");
  }

  public void testRR() throws Exception {
    checkOneTerm(analyzer, "\u0692", "\u0695");
  }

  public void testInitialRR() throws Exception {
    // always, so frequently omitted
    checkOneTerm(analyzer, "\u0631\u0631\u0631", "\u0695\u0631\u0631");
  }

  // tatweel and all harakat are removed entirely
  public void testRemove() throws Exception {
    checkOneTerm(analyzer, "\u0640", "");
    checkOneTerm(analyzer, "\u064B", "");
    checkOneTerm(analyzer, "\u064C", "");
    checkOneTerm(analyzer, "\u064D", "");
    checkOneTerm(analyzer, "\u064E", "");
    checkOneTerm(analyzer, "\u064F", "");
    checkOneTerm(analyzer, "\u0650", "");
    checkOneTerm(analyzer, "\u0651", "");
    checkOneTerm(analyzer, "\u0652", "");
    // we peek backwards in this case to look for h+200C, ensure this works
    checkOneTerm(analyzer, "\u200C", "");
  }

  public void testEmptyTerm() throws IOException {
    checkOneTerm(analyzer, "", "");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Sorani normalization factory is working.
*/
public class TestSoraniNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  // the factory-built filter must strip tatweel (kashida) from the term
  public void testNormalization() throws Exception {
    Reader reader = new StringReader("پیــــاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاوەکان" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniNormalization", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}

View File

@ -0,0 +1,100 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Sorani Stemmer.
*/
public class TestSoraniStemFilter extends BaseTokenStreamTestCase {
  // full analyzer chain (normalization + stemming) shared by most tests
  SoraniAnalyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);

  public void testIndefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوێک", "پیاو"); // -ek
    checkOneTerm(a, "دەرگایەک", "دەرگا"); // -yek
  }

  public void testDefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوەكە", "پیاو"); // -aka
    checkOneTerm(a, "دەرگاكە", "دەرگا"); // -ka
  }

  public void testDemonstrativeSingular() throws Exception {
    checkOneTerm(a, "کتاویە", "کتاوی"); // -a
    checkOneTerm(a, "دەرگایە", "دەرگا"); // -ya
  }

  public void testIndefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوان", "پیاو"); // -An
    checkOneTerm(a, "دەرگایان", "دەرگا"); // -yAn
  }

  public void testDefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوەکان", "پیاو"); // -akAn
    checkOneTerm(a, "دەرگاکان", "دەرگا"); // -kAn
  }

  public void testDemonstrativePlural() throws Exception {
    checkOneTerm(a, "پیاوانە", "پیاو"); // -Ana
    checkOneTerm(a, "دەرگایانە", "دەرگا"); // -yAna
  }

  public void testEzafe() throws Exception {
    checkOneTerm(a, "هۆتیلی", "هۆتیل"); // singular
    checkOneTerm(a, "هۆتیلێکی", "هۆتیل"); // indefinite
    checkOneTerm(a, "هۆتیلانی", "هۆتیل"); // plural
  }

  public void testPostpositions() throws Exception {
    checkOneTerm(a, "دوورەوە", "دوور"); // -awa
    checkOneTerm(a, "نیوەشەودا", "نیوەشەو"); // -dA
    checkOneTerm(a, "سۆرانا", "سۆران"); // -A
  }

  public void testPossessives() throws Exception {
    checkOneTerm(a, "پارەمان", "پارە"); // -mAn
    checkOneTerm(a, "پارەتان", "پارە"); // -tAn
    checkOneTerm(a, "پارەیان", "پارە"); // -yAn
  }

  // stem filter alone (no normalization) must handle a zero-length term
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SoraniStemFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
  }

  /** test against a basic vocabulary file */
  public void testVocabulary() throws Exception {
    // top 8k words or so: freq > 1000
    assertVocabulary(a, getDataFile("ckbtestdata.zip"), "testdata.txt");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
 * Simple tests to ensure the Sorani stem factory is working.
 */
public class TestSoraniStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  /** the factory must produce a filter that stems Sorani text */
  public void testStemming() throws Exception {
    Reader input = new StringReader("پیاوەکان");
    TokenStream ts = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
    ts = tokenFilterFactory("SoraniStem").create(ts);
    assertTokenStreamContents(ts, new String[] { "پیاو" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniStem", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      // the factory is expected to reject parameters it does not understand
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}

View File

@ -612,7 +612,7 @@
<property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/> <property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/>
<property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/> <property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
<property name="analysis.conf.dest" value="${example}/solr/conf/lang"/> <property name="analysis.conf.dest" value="${example}/solr/collection1/conf/lang"/>
<target name="sync-analyzers" <target name="sync-analyzers"
description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example"> description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">
@ -625,6 +625,9 @@
<!-- catalan --> <!-- catalan -->
<copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt" <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_ca.txt"/> tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
<!-- kurdish -->
<copy verbose="true" file="${analysis-common.res.dir}/ckb/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_ckb.txt"/>
<!-- czech --> <!-- czech -->
<copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt" <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
tofile="${analysis.conf.dest}/stopwords_cz.txt"/> tofile="${analysis.conf.dest}/stopwords_cz.txt"/>

View File

@ -0,0 +1,136 @@
# Set of Kurdish (Sorani) stopwords.
# Note: these have been normalized with our normalization scheme (e represented with U+06D5, etc.)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک

View File

@ -779,6 +779,18 @@
</analyzer> </analyzer>
</fieldType> </fieldType>
<!-- Kurdish -->
<fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SoraniNormalizationFilterFactory"/>
<!-- for any latin text -->
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ckb.txt"/>
<filter class="solr.SoraniStemFilterFactory"/>
</analyzer>
</fieldType>
<!-- Czech --> <!-- Czech -->
<fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
<analyzer> <analyzer>