mirror of https://github.com/apache/lucene.git
LUCENE-5379: Kurdish Analyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1555359 13f79535-47bb-0310-9956-ffa450edef68
parent 9d0b60388d
commit 2140f4368a
@@ -81,6 +81,8 @@ New Features
  matter in practice if the number of ranges is over 10 or so. (Mike
  McCandless)

+* LUCENE-5379: Add Analyzer for Kurdish. (Robert Muir)

Build

* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable
@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Sorani Kurdish.
 */
public final class SoraniAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Kurdish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public SoraniAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SoraniStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SoraniStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}
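Editor's note: the analyzer above wires StandardTokenizer, StandardFilter, SoraniNormalizationFilter, LowerCaseFilter, StopFilter and SoraniStemFilter into a single chain. The following is a minimal usage sketch, not part of this commit; it assumes the Lucene 4.x TokenStream consumer API used throughout this patch and reuses Version.LUCENE_CURRENT, the same constant the stopword loader above relies on. The expected output mirrors TestSoraniAnalyzer#testStopwords further down in this commit.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SoraniAnalyzerUsage {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new SoraniAnalyzer(Version.LUCENE_CURRENT);
    // "ئەم پیاوە": the stopword "ئەم" is removed and "پیاوە" is stemmed to "پیاو",
    // as asserted by TestSoraniAnalyzer in this commit.
    TokenStream ts = analyzer.tokenStream("body", new StringReader("ئەم پیاوە"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints: پیاو
    }
    ts.end();
    ts.close();
  }
}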
@@ -0,0 +1,47 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniNormalizer} to normalize the
 * orthography.
 */
public final class SoraniNormalizationFilter extends TokenFilter {
  private final SoraniNormalizer normalizer = new SoraniNormalizer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public SoraniNormalizationFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
      termAtt.setLength(newlen);
      return true;
    }
    return false;
  }
}
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniNormalizationFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.SoraniNormalizationFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /** Creates a new SoraniNormalizationFilterFactory */
  public SoraniNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniNormalizationFilter create(TokenStream input) {
    return new SoraniNormalizationFilter(input);
  }

  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
}
@@ -0,0 +1,127 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.delete;

/**
 * Normalizes the Unicode representation of Sorani text.
 * <p>
 * Normalization consists of:
 * <ul>
 *   <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
 *   <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
 *   <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
 *   <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
 *   <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
 *   <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
 * </ul>
 */
public class SoraniNormalizer {

  static final char YEH = '\u064A';
  static final char DOTLESS_YEH = '\u0649';
  static final char FARSI_YEH = '\u06CC';

  static final char KAF = '\u0643';
  static final char KEHEH = '\u06A9';

  static final char HEH = '\u0647';
  static final char AE = '\u06D5';
  static final char ZWNJ = '\u200C';
  static final char HEH_DOACHASHMEE = '\u06BE';
  static final char TEH_MARBUTA = '\u0629';

  static final char REH = '\u0631';
  static final char RREH = '\u0695';
  static final char RREH_ABOVE = '\u0692';

  static final char TATWEEL = '\u0640';
  static final char FATHATAN = '\u064B';
  static final char DAMMATAN = '\u064C';
  static final char KASRATAN = '\u064D';
  static final char FATHA = '\u064E';
  static final char DAMMA = '\u064F';
  static final char KASRA = '\u0650';
  static final char SHADDA = '\u0651';
  static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Sorani text
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case YEH:
        case DOTLESS_YEH:
          s[i] = FARSI_YEH;
          break;
        case KAF:
          s[i] = KEHEH;
          break;
        case ZWNJ:
          if (i > 0 && s[i-1] == HEH) {
            s[i-1] = AE;
          }
          len = delete(s, i, len);
          i--;
          break;
        case HEH:
          if (i == len-1) {
            s[i] = AE;
          }
          break;
        case TEH_MARBUTA:
          s[i] = AE;
          break;
        case HEH_DOACHASHMEE:
          s[i] = HEH;
          break;
        case REH:
          if (i == 0) {
            s[i] = RREH;
          }
          break;
        case RREH_ABOVE:
          s[i] = RREH;
          break;
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          len = delete(s, i, len);
          i--;
          break;
        default:
          if (Character.getType(s[i]) == Character.FORMAT) {
            len = delete(s, i, len);
            i--;
          }
      }
    }
    return len;
  }
}
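Editor's note: the normalization rules listed in the class javadoc can also be exercised directly on a char buffer, which is exactly what SoraniNormalizationFilter does per token. A small sketch, not part of this commit; the sample word and the expected result are assumptions derived from the mappings above (the tatweel characters U+0640 are deleted in place).

import org.apache.lucene.analysis.ckb.SoraniNormalizer;

public class SoraniNormalizerUsage {
  public static void main(String[] args) {
    SoraniNormalizer normalizer = new SoraniNormalizer();
    // "پیــاوە" contains two tatweel (U+0640) padding characters.
    char[] buffer = "پیــاوە".toCharArray();
    // normalize() rewrites the buffer in place and returns the new length.
    int len = normalizer.normalize(buffer, buffer.length);
    System.out.println(new String(buffer, 0, len)); // prints: پیاوە
  }
}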
@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * A {@link TokenFilter} that applies {@link SoraniStemmer} to stem Sorani words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see SetKeywordMarkerFilter */
public final class SoraniStemFilter extends TokenFilter {
  private final SoraniStemmer stemmer = new SoraniStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public SoraniStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if(!keywordAttr.isKeyword()) {
        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
}
@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link SoraniStemFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.SoraniNormalizationFilterFactory"/>
 *     <filter class="solr.SoraniStemFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class SoraniStemFilterFactory extends TokenFilterFactory {

  /** Creates a new SoraniStemFilterFactory */
  public SoraniStemFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SoraniStemFilter create(TokenStream input) {
    return new SoraniStemFilter(input);
  }
}
@@ -0,0 +1,103 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;

/**
 * Light stemmer for Sorani
 */
public class SoraniStemmer {

  /**
   * Stem an input buffer of Sorani text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(char s[], int len) {
    // postposition
    if (len > 5 && endsWith(s, len, "دا")) {
      len -= 2;
    } else if (len > 4 && endsWith(s, len, "نا")) {
      len--;
    } else if (len > 6 && endsWith(s, len, "ەوە")) {
      len -= 3;
    }

    // possessive pronoun
    if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان"))) {
      len -= 3;
    }

    // indefinite singular ezafe
    if (len > 6 && endsWith(s, len, "ێکی")) {
      return len-3;
    } else if (len > 7 && endsWith(s, len, "یەکی")) {
      return len-4;
    }
    // indefinite singular
    if (len > 5 && endsWith(s, len, "ێک")) {
      return len-2;
    } else if (len > 6 && endsWith(s, len, "یەک")) {
      return len-3;
    }
    // definite singular
    else if (len > 6 && endsWith(s, len, "ەکە")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "کە")) {
      return len-2;
    }
    // definite plural
    else if (len > 7 && endsWith(s, len, "ەکان")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "کان")) {
      return len-3;
    }
    // indefinite plural ezafe
    else if (len > 7 && endsWith(s, len, "یانی")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انی")) {
      return len-3;
    }
    // indefinite plural
    else if (len > 6 && endsWith(s, len, "یان")) {
      return len-3;
    } else if (len > 5 && endsWith(s, len, "ان")) {
      return len-2;
    }
    // demonstrative plural
    else if (len > 7 && endsWith(s, len, "یانە")) {
      return len-4;
    } else if (len > 6 && endsWith(s, len, "انە")) {
      return len-3;
    }
    // demonstrative singular
    else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە"))) {
      return len-2;
    } else if (len > 4 && endsWith(s, len, "ە")) {
      return len-1;
    }
    // absolute singular ezafe
    else if (len > 4 && endsWith(s, len, "ی")) {
      return len-1;
    }
    return len;
  }
}
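Editor's note: the light stemmer can likewise be called directly on a term buffer; SoraniStemFilter does this for every token that is not marked as a keyword. A minimal sketch, not part of this commit; the expected stem matches the پیاوەکان to پیاو case exercised by the tests later in this patch.

import org.apache.lucene.analysis.ckb.SoraniStemmer;

public class SoraniStemmerUsage {
  public static void main(String[] args) {
    SoraniStemmer stemmer = new SoraniStemmer();
    // "پیاوەکان" = stem "پیاو" plus the definite plural suffix "ەکان"
    char[] buffer = "پیاوەکان".toCharArray();
    int len = stemmer.stem(buffer, buffer.length);  // strips the suffix in place
    System.out.println(new String(buffer, 0, len)); // prints: پیاو
  }
}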
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Sorani Kurdish.
</body>
</html>
@@ -19,6 +19,8 @@ org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
+org.apache.lucene.analysis.ckb.SoraniNormalizationFilterFactory
+org.apache.lucene.analysis.ckb.SoraniStemFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
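Editor's note: registering the two factories in this SPI file is what lets Solr (and any other caller) resolve them by short name. A hedged sketch, not part of this commit: it assumes the TokenFilterFactory.forName lookup of this Lucene 4.x codebase and an empty argument map, which these particular factories accept since they require no parameters.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SoraniFactoryLookup {
  public static void main(String[] args) {
    Map<String,String> noArgs = new HashMap<String,String>();
    // Resolved through META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    TokenFilterFactory stem = TokenFilterFactory.forName("SoraniStem", noArgs);
    TokenFilterFactory norm = TokenFilterFactory.forName("SoraniNormalization", noArgs);
    System.out.println(stem.getClass().getName()); // ...ckb.SoraniStemFilterFactory
    System.out.println(norm.getClass().getName()); // ...ckb.SoraniNormalizationFilterFactory
  }
}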
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
@@ -0,0 +1,66 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Test the Sorani analyzer
 */
public class TestSoraniAnalyzer extends BaseTokenStreamTestCase {

  /**
   * This test fails with NPE when the stopwords file is missing in classpath
   */
  public void testResourcesAvailable() {
    new SoraniAnalyzer(TEST_VERSION_CURRENT);
  }

  public void testStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "ئەم پیاوە", new String[] {"پیاو"});
  }

  public void testCustomStopwords() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
    assertAnalyzesTo(a, "ئەم پیاوە",
        new String[] {"ئەم", "پیاو"});
  }

  public void testReusableTokenStream() throws IOException {
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "پیاوە", new String[] {"پیاو"});
    assertAnalyzesTo(a, "پیاو", new String[] {"پیاو"});
  }

  public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("پیاوە");
    Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new SoraniAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Tests normalization for Sorani (this is more critical than stemming...)
 */
public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
    }
  };

  public void testY() throws Exception {
    checkOneTerm(a, "\u064A", "\u06CC");
    checkOneTerm(a, "\u0649", "\u06CC");
    checkOneTerm(a, "\u06CC", "\u06CC");
  }

  public void testK() throws Exception {
    checkOneTerm(a, "\u0643", "\u06A9");
    checkOneTerm(a, "\u06A9", "\u06A9");
  }

  public void testH() throws Exception {
    // initial
    checkOneTerm(a, "\u0647\u200C", "\u06D5");
    // medial
    checkOneTerm(a, "\u0647\u200C\u06A9", "\u06D5\u06A9");

    checkOneTerm(a, "\u06BE", "\u0647");
    checkOneTerm(a, "\u0629", "\u06D5");
  }

  public void testFinalH() throws Exception {
    // always (and in final form by def), so frequently omitted
    checkOneTerm(a, "\u0647\u0647\u0647", "\u0647\u0647\u06D5");
  }

  public void testRR() throws Exception {
    checkOneTerm(a, "\u0692", "\u0695");
  }

  public void testInitialRR() throws Exception {
    // always, so frequently omitted
    checkOneTerm(a, "\u0631\u0631\u0631", "\u0695\u0631\u0631");
  }

  public void testRemove() throws Exception {
    checkOneTerm(a, "\u0640", "");
    checkOneTerm(a, "\u064B", "");
    checkOneTerm(a, "\u064C", "");
    checkOneTerm(a, "\u064D", "");
    checkOneTerm(a, "\u064E", "");
    checkOneTerm(a, "\u064F", "");
    checkOneTerm(a, "\u0650", "");
    checkOneTerm(a, "\u0651", "");
    checkOneTerm(a, "\u0652", "");
    // we peek backwards in this case to look for h+200C, ensure this works
    checkOneTerm(a, "\u200C", "");
  }

  public void testEmptyTerm() throws IOException {
    checkOneTerm(a, "", "");
  }
}
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani normalization factory is working.
 */
public class TestSoraniNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testNormalization() throws Exception {
    Reader reader = new StringReader("پیــــاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاوەکان" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniNormalization", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
@@ -0,0 +1,100 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
 * Test the Sorani Stemmer.
 */
public class TestSoraniStemFilter extends BaseTokenStreamTestCase {
  SoraniAnalyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT);

  public void testIndefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوێک", "پیاو"); // -ek
    checkOneTerm(a, "دەرگایەک", "دەرگا"); // -yek
  }

  public void testDefiniteSingular() throws Exception {
    checkOneTerm(a, "پیاوەكە", "پیاو"); // -aka
    checkOneTerm(a, "دەرگاكە", "دەرگا"); // -ka
  }

  public void testDemonstrativeSingular() throws Exception {
    checkOneTerm(a, "کتاویە", "کتاوی"); // -a
    checkOneTerm(a, "دەرگایە", "دەرگا"); // -ya
  }

  public void testIndefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوان", "پیاو"); // -An
    checkOneTerm(a, "دەرگایان", "دەرگا"); // -yAn
  }

  public void testDefinitePlural() throws Exception {
    checkOneTerm(a, "پیاوەکان", "پیاو"); // -akAn
    checkOneTerm(a, "دەرگاکان", "دەرگا"); // -kAn
  }

  public void testDemonstrativePlural() throws Exception {
    checkOneTerm(a, "پیاوانە", "پیاو"); // -Ana
    checkOneTerm(a, "دەرگایانە", "دەرگا"); // -yAna
  }

  public void testEzafe() throws Exception {
    checkOneTerm(a, "هۆتیلی", "هۆتیل"); // singular
    checkOneTerm(a, "هۆتیلێکی", "هۆتیل"); // indefinite
    checkOneTerm(a, "هۆتیلانی", "هۆتیل"); // plural
  }

  public void testPostpositions() throws Exception {
    checkOneTerm(a, "دوورەوە", "دوور"); // -awa
    checkOneTerm(a, "نیوەشەودا", "نیوەشەو"); // -dA
    checkOneTerm(a, "سۆرانا", "سۆران"); // -A
  }

  public void testPossessives() throws Exception {
    checkOneTerm(a, "پارەمان", "پارە"); // -mAn
    checkOneTerm(a, "پارەتان", "پارە"); // -tAn
    checkOneTerm(a, "پارەیان", "پارە"); // -yAn
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SoraniStemFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
  }

  /** test against a basic vocabulary file */
  public void testVocabulary() throws Exception {
    // top 8k words or so: freq > 1000
    assertVocabulary(a, getDataFile("ckbtestdata.zip"), "testdata.txt");
  }
}
@@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ckb;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Sorani stem factory is working.
 */
public class TestSoraniStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testStemming() throws Exception {
    Reader reader = new StringReader("پیاوەکان");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("SoraniStem").create(stream);
    assertTokenStreamContents(stream, new String[] { "پیاو" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("SoraniStem", "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
Binary file not shown.
@@ -612,7 +612,7 @@
  <property name="analysis-common.res.dir" value="../lucene/analysis/common/src/resources/org/apache/lucene/analysis"/>
  <property name="analysis-kuromoji.res.dir" value="../lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
- <property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
+ <property name="analysis.conf.dest" value="${example}/solr/collection1/conf/lang"/>

  <target name="sync-analyzers"
          description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">

@@ -625,6 +625,9 @@
    <!-- catalan -->
    <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
          tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
+   <!-- kurdish -->
+   <copy verbose="true" file="${analysis-common.res.dir}/ckb/stopwords.txt"
+         tofile="${analysis.conf.dest}/stopwords_ckb.txt"/>
    <!-- czech -->
    <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
          tofile="${analysis.conf.dest}/stopwords_cz.txt"/>
@@ -0,0 +1,136 @@
# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
@@ -779,6 +779,18 @@
      </analyzer>
    </fieldType>

+    <!-- Kurdish -->
+    <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SoraniNormalizationFilterFactory"/>
+        <!-- for any latin text -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ckb.txt"/>
+        <filter class="solr.SoraniStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
    <!-- Czech -->
    <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
      <analyzer>