mirror of https://github.com/apache/lucene.git
LUCENE-5379: Kurdish Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1555359 13f79535-47bb-0310-9956-ffa450edef68
commit 2140f4368a (parent 9d0b60388d)
lucene/CHANGES.txt:
@@ -81,6 +81,8 @@ New Features
   matter in practice if the number of ranges is over 10 or so.  (Mike
   McCandless)
 
+* LUCENE-5379: Add Analyzer for Kurdish.  (Robert Muir)
+
 Build
 
 * LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable
org/apache/lucene/analysis/ckb/SoraniAnalyzer.java (new file):
@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Sorani Kurdish.
 */
public final class SoraniAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Kurdish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public SoraniAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link SoraniStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SoraniStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}
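A minimal usage sketch (not part of this commit) showing how the new analyzer might be driven through the usual reset/incrementToken/end/close TokenStream cycle. The demo class name, the field name "body", and the sample phrase (borrowed from the tests added below) are illustrative assumptions.

// Hypothetical demo, not included in the commit.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SoraniAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    SoraniAnalyzer analyzer = new SoraniAnalyzer(Version.LUCENE_CURRENT);
    // "ئەم پیاوە" ("this man"): the demonstrative is a default stopword,
    // so only the stemmed token "پیاو" is expected (see TestSoraniAnalyzer).
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("\u0626\u06D5\u0645 \u067E\u06CC\u0627\u0648\u06D5"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}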
org/apache/lucene/analysis/ckb/SoraniNormalizationFilter.java (new file):
@@ -0,0 +1,47 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
 * orthography.
 */
public final class SoraniNormalizationFilter extends TokenFilter {
  private final SoraniNormalizer normalizer = new SoraniNormalizer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public SoraniNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
      termAtt.setLength(newlen);
      return true;
    }
    return false;
  }
}
org/apache/lucene/analysis/ckb/SoraniNormalizationFilterFactory.java (new file):
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniNormalizationFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /** Creates a new SoraniNormalizationFilterFactory */
  public SoraniNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniNormalizationFilter create(TokenStream input) {
    return new SoraniNormalizationFilter(input);
  }

  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
}
org/apache/lucene/analysis/ckb/SoraniNormalizer.java (new file):
@@ -0,0 +1,127 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.delete;

/**
 * Normalizes the Unicode representation of Sorani text.
 * <p>
 * Normalization consists of:
 * <ul>
 *   <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
 *   <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
 *   <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
 *   <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
 *   <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
 *   <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
 * </ul>
 */
public class SoraniNormalizer {

  static final char YEH = '\u064A';
  static final char DOTLESS_YEH = '\u0649';
  static final char FARSI_YEH = '\u06CC';

  static final char KAF = '\u0643';
  static final char KEHEH = '\u06A9';

  static final char HEH = '\u0647';
  static final char AE = '\u06D5';
  static final char ZWNJ = '\u200C';
  static final char HEH_DOACHASHMEE = '\u06BE';
  static final char TEH_MARBUTA = '\u0629';

  static final char REH = '\u0631';
  static final char RREH = '\u0695';
  static final char RREH_ABOVE = '\u0692';

  static final char TATWEEL = '\u0640';
  static final char FATHATAN = '\u064B';
  static final char DAMMATAN = '\u064C';
  static final char KASRATAN = '\u064D';
  static final char FATHA = '\u064E';
  static final char DAMMA = '\u064F';
  static final char KASRA = '\u0650';
  static final char SHADDA = '\u0651';
  static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Sorani text
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case YEH:
        case DOTLESS_YEH:
          s[i] = FARSI_YEH;
          break;
        case KAF:
          s[i] = KEHEH;
          break;
        case ZWNJ:
          if (i > 0 && s[i-1] == HEH) {
            s[i-1] = AE;
          }
          len = delete(s, i, len);
          i--;
          break;
        case HEH:
          if (i == len-1) {
            s[i] = AE;
          }
          break;
        case TEH_MARBUTA:
          s[i] = AE;
          break;
        case HEH_DOACHASHMEE:
          s[i] = HEH;
          break;
        case REH:
          if (i == 0) {
            s[i] = RREH;
          }
          break;
        case RREH_ABOVE:
          s[i] = RREH;
          break;
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          len = delete(s, i, len);
          i--;
          break;
        default:
          if (Character.getType(s[i]) == Character.FORMAT) {
            len = delete(s, i, len);
            i--;
          }
      }
    }
    return len;
  }
}
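A small sketch (again not part of the commit) of the in-place normalize(char[], int) contract described in the javadoc above: the buffer is rewritten and the new valid length is returned, which shrinks when characters such as tatweel or harakat are deleted. The demo class name and the input are assumptions.

// Hypothetical demo, not included in the commit.
import org.apache.lucene.analysis.ckb.SoraniNormalizer;

public class SoraniNormalizerDemo {
  public static void main(String[] args) {
    SoraniNormalizer normalizer = new SoraniNormalizer();
    // KAF (0643) + dotless YEH (0649) + TATWEEL (0640)
    char[] buf = "\u0643\u0649\u0640".toCharArray();
    int len = normalizer.normalize(buf, buf.length);
    // KAF -> KEHEH (06A9), dotless YEH -> FARSI YEH (06CC), TATWEEL deleted, so len == 2
    System.out.println(new String(buf, 0, len));
  }
}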
org/apache/lucene/analysis/ckb/SoraniStemFilter.java (new file):
@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see SetKeywordMarkerFilter
 */
public final class SoraniStemFilter extends TokenFilter {
  private final SoraniStemmer stemmer = new SoraniStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public SoraniStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if(!keywordAttr.isKeyword()) {
        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
}
org/apache/lucene/analysis/ckb/SoraniStemFilterFactory.java (new file):
@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniStemFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
 *     &lt;filter class="solr.SoraniStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class SoraniStemFilterFactory extends TokenFilterFactory {

  /** Creates a new SoraniStemFilterFactory */
  public SoraniStemFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniStemFilter create(TokenStream input) {
    return new SoraniStemFilter(input);
  }
}
org/apache/lucene/analysis/ckb/SoraniStemmer.java (new file):
@@ -0,0 +1,103 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;

/**
 * Light stemmer for Sorani
 */
public class SoraniStemmer {

  /**
   * Stem an input buffer of Sorani text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after stemming
   */
  public int stem(char s[], int len) {
    // postposition
    if (len > 5 && endsWith(s, len, "دا")) {
      len -= 2;
    } else if (len > 4 && endsWith(s, len, "نا")) {
      len--;
    } else if (len > 6 && endsWith(s, len, "ەوە")) {
      len -= 3;
    }

    // possessive pronoun
    if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
      len -= 3;
    }

    // indefinite singular ezafe
    if (len > 6 && endsWith(s, len, "ێکی")) {
      return len-3;
    } else if (len > 7 && endsWith(s, len, "یەکی")) {
      return len-4;
    }
    // indefinite singular
    if (len > 5 && endsWith(s, len, "ێک")) {
      return len-2;
    } else if (len > 6 && endsWith(s, len, "یەک")) {
      return len-3;
    }
    // definite singular
    else if (len > 6 && endsWith(s, len, "ەکە")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "کە")) {
      return len-2;
    }
    // definite plural
    else if (len > 7 && endsWith(s, len, "ەکان")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "کان")) {
      return len-3;
    }
    // indefinite plural ezafe
    else if (len > 7 && endsWith(s, len, "یانی")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انی")) {
      return len-3;
    }
    // indefinite plural
    else if (len > 6 && endsWith(s, len, "یان")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "ان")) {
      return len-2;
    }
    // demonstrative plural
    else if (len > 7 && endsWith(s, len, "یانە")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انە")) {
      return len-3;
    }
    // demonstrative singular
    else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
      return len-2;
    } else if (len > 4 && endsWith(s, len, "ە")) {
      return len-1;
    }
    // absolute singular ezafe
    else if (len > 4 && endsWith(s, len, "ی")) {
      return len-1;
    }
    return len;
  }
}
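A similar hypothetical sketch for the light stemmer: stem(char[], int) strips at most one suffix per rule group and returns the new length. The demo class name, the sample word, and the English gloss are assumptions for illustration; the expected result matches the testDefinitePlural case below.

// Hypothetical demo, not included in the commit.
import org.apache.lucene.analysis.ckb.SoraniStemmer;

public class SoraniStemmerDemo {
  public static void main(String[] args) {
    SoraniStemmer stemmer = new SoraniStemmer();
    // "پیاوەکان" (roughly "the men"): the definite plural suffix -akan is stripped
    char[] buf = "\u067E\u06CC\u0627\u0648\u06D5\u06A9\u0627\u0646".toCharArray();
    int len = stemmer.stem(buf, buf.length);
    // prints the four-character stem "پیاو"
    System.out.println(new String(buf, 0, len));
  }
}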
org/apache/lucene/analysis/ckb/package.html (new file):
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Sorani Kurdish.
</body>
</html>
SPI registration file (META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory):
@@ -19,6 +19,8 @@ org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
 org.apache.lucene.analysis.br.BrazilianStemFilterFactory
 org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
 org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
+org.apache.lucene.analysis.ckb.SoraniNormalizationFilterFactory
+org.apache.lucene.analysis.ckb.SoraniStemFilterFactory
 org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
 org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
 org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
org/apache/lucene/analysis/ckb/stopwords.txt (new file):
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java (new test):
@@ -0,0 +1,66 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Test the Sorani analyzer
 */
public class TestSoraniAnalyzer extends BaseTokenStreamTestCase {

  /**
   * This test fails with NPE when the stopwords file is missing in the classpath
   */
  public void testResourcesAvailable() {
    new SoraniAnalyzer(TEST_VERSION_CURRENT);
  }

  public void testStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "ئەم پیاوە", new String[] {"پیاو"});
  }

  public void testCustomStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
    assertAnalyzesTo(a, "ئەم پیاوە",
        new String[] {"ئەم", "پیاو"});
  }

  public void testReusableTokenStream() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "پیاوە", new String[] {"پیاو"});
    assertAnalyzesTo(a, "پیاو", new String[] {"پیاو"});
  }

  public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("پیاوە");
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new SoraniAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
}
org/apache/lucene/analysis/ckb/TestSoraniNormalizationFilter.java (new test):
@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Tests normalization for Sorani (this is more critical than stemming...)
 */
public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
    }
  };

  public void testY() throws Exception {
    checkOneTerm(a, "\u064A", "\u06CC");
    checkOneTerm(a, "\u0649", "\u06CC");
    checkOneTerm(a, "\u06CC", "\u06CC");
  }

  public void testK() throws Exception {
    checkOneTerm(a, "\u0643", "\u06A9");
    checkOneTerm(a, "\u06A9", "\u06A9");
  }

  public void testH() throws Exception {
    // initial
    checkOneTerm(a, "\u0647\u200C", "\u06D5");
    // medial
    checkOneTerm(a, "\u0647\u200C\u06A9", "\u06D5\u06A9");

    checkOneTerm(a, "\u06BE", "\u0647");
    checkOneTerm(a, "\u0629", "\u06D5");
  }

  public void testFinalH() throws Exception {
    // always (and in final form by def), so frequently omitted
    checkOneTerm(a, "\u0647\u0647\u0647", "\u0647\u0647\u06D5");
  }

  public void testRR() throws Exception {
    checkOneTerm(a, "\u0692", "\u0695");
  }

  public void testInitialRR() throws Exception {
    // always, so frequently omitted
    checkOneTerm(a, "\u0631\u0631\u0631", "\u0695\u0631\u0631");
  }

  public void testRemove() throws Exception {
    checkOneTerm(a, "\u0640", "");
    checkOneTerm(a, "\u064B", "");
    checkOneTerm(a, "\u064C", "");
    checkOneTerm(a, "\u064D", "");
    checkOneTerm(a, "\u064E", "");
    checkOneTerm(a, "\u064F", "");
    checkOneTerm(a, "\u0650", "");
    checkOneTerm(a, "\u0651", "");
    checkOneTerm(a, "\u0652", "");
    // we peek backwards in this case to look for h+200C, ensure this works
    checkOneTerm(a, "\u200C", "");
  }

  public void testEmptyTerm() throws IOException {
    checkOneTerm(a, "", "");
  }
}
org/apache/lucene/analysis/ckb/TestSoraniNormalizationFilterFactory.java (new test):
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani normalization factory is working.
 */
public class TestSoraniNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testNormalization() throws Exception {
    Reader reader = new StringReader("پیــــاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاوەکان" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniNormalization", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java (new test):
@@ -0,0 +1,100 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Test the Sorani Stemmer.
 */
public class TestSoraniStemFilter extends BaseTokenStreamTestCase {
  SoraniAnalyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);

  public void testIndefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوێک", "پیاو"); // -ek
    checkOneTerm(a, "دەرگایەک", "دەرگا"); // -yek
  }

  public void testDefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوەكە", "پیاو"); // -aka
    checkOneTerm(a, "دەرگاكە", "دەرگا"); // -ka
  }

  public void testDemonstrativeSingular() throws Exception {
    checkOneTerm(a, "کتاویە", "کتاوی"); // -a
    checkOneTerm(a, "دەرگایە", "دەرگا"); // -ya
  }

  public void testIndefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوان", "پیاو"); // -An
    checkOneTerm(a, "دەرگایان", "دەرگا"); // -yAn
  }

  public void testDefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوەکان", "پیاو"); // -akAn
    checkOneTerm(a, "دەرگاکان", "دەرگا"); // -kAn
  }

  public void testDemonstrativePlural() throws Exception {
    checkOneTerm(a, "پیاوانە", "پیاو"); // -Ana
    checkOneTerm(a, "دەرگایانە", "دەرگا"); // -yAna
  }

  public void testEzafe() throws Exception {
    checkOneTerm(a, "هۆتیلی", "هۆتیل"); // singular
    checkOneTerm(a, "هۆتیلێکی", "هۆتیل"); // indefinite
    checkOneTerm(a, "هۆتیلانی", "هۆتیل"); // plural
  }

  public void testPostpositions() throws Exception {
    checkOneTerm(a, "دوورەوە", "دوور"); // -awa
    checkOneTerm(a, "نیوەشەودا", "نیوەشەو"); // -dA
    checkOneTerm(a, "سۆرانا", "سۆران"); // -A
  }

  public void testPossessives() throws Exception {
    checkOneTerm(a, "پارەمان", "پارە"); // -mAn
    checkOneTerm(a, "پارەتان", "پارە"); // -tAn
    checkOneTerm(a, "پارەیان", "پارە"); // -yAn
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SoraniStemFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
  }

  /** test against a basic vocabulary file */
  public void testVocabulary() throws Exception {
    // top 8k words or so: freq > 1000
    assertVocabulary(a, getDataFile("ckbtestdata.zip"), "testdata.txt");
  }
}
org/apache/lucene/analysis/ckb/TestSoraniStemFilterFactory.java (new test):
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani stem factory is working.
 */
public class TestSoraniStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testStemming() throws Exception {
    Reader reader = new StringReader("پیاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniStem").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاو" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniStem", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
Binary file not shown.
Solr build file (sync-analyzers target):
@@ -612,7 +612,7 @@
 
   <property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/>
   <property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
-  <property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
+  <property name="analysis.conf.dest" value="${example}/solr/collection1/conf/lang"/>
 
   <target name="sync-analyzers"
           description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">
@@ -625,6 +625,9 @@
     <!-- catalan -->
     <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
           tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
+    <!-- kurdish -->
+    <copy verbose="true" file="${analysis-common.res.dir}/ckb/stopwords.txt"
+          tofile="${analysis.conf.dest}/stopwords_ckb.txt"/>
     <!-- czech -->
     <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
           tofile="${analysis.conf.dest}/stopwords_cz.txt"/>
Solr example lang/stopwords_ckb.txt (new file, same content as the analyzers-common resource):
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
Solr example schema.xml (fieldType definitions):
@@ -779,6 +779,18 @@
     </analyzer>
   </fieldType>
 
+  <!-- Kurdish -->
+  <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+      <filter class="solr.SoraniNormalizationFilterFactory"/>
+      <!-- for any latin text -->
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ckb.txt"/>
+      <filter class="solr.SoraniStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
   <!-- Czech -->
   <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
     <analyzer>