mirror of https://github.com/apache/lucene.git
LUCENE-5482: Improve default TurkishAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1573066 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2356994e0c
commit
4193bce372
|
@ -81,6 +81,9 @@ New Features
|
||||||
additions and updates (to change weight or payload of an existing
|
additions and updates (to change weight or payload of an existing
|
||||||
suggestion). (Mike McCandless)
|
suggestion). (Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-5482: Improve default TurkishAnalyzer by adding apostrophe
|
||||||
|
handling suitable for Turkish. (Ahmet Arslan via Robert Muir)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
|
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Strips all characters after an apostrophe (including the apostrophe itself).
|
||||||
|
* <p>
|
||||||
|
* In Turkish, apostrophe is used to separate suffixes from proper names
|
||||||
|
* (continent, sea, river, lake, mountain, upland, proper names related to
|
||||||
|
* religion and mythology). This filter is intended to be used before stem filters.
|
||||||
|
* For more information, see <a href="http://www.ipcsit.com/vol57/015-ICNI2012-M021.pdf">
|
||||||
|
* Role of Apostrophes in Turkish Information Retrieval</a>
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class ApostropheFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
public ApostropheFilter(TokenStream in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws IOException {
|
||||||
|
if (!input.incrementToken())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
final char[] buffer = termAtt.buffer();
|
||||||
|
final int length = termAtt.length();
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i++)
|
||||||
|
if (buffer[i] == '\'' || buffer[i] == '\u2019') {
|
||||||
|
termAtt.setLength(i);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link ApostropheFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_tr_lower_apostrophes" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.ApostropheFilterFactory"/>
|
||||||
|
* <filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*/
|
||||||
|
public class ApostropheFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
public ApostropheFilterFactory(Map<String, String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameter(s): " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ApostropheFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -122,6 +122,8 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName) {
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
final Tokenizer source = new StandardTokenizer(matchVersion);
|
final Tokenizer source = new StandardTokenizer(matchVersion);
|
||||||
TokenStream result = new StandardFilter(matchVersion, source);
|
TokenStream result = new StandardFilter(matchVersion, source);
|
||||||
|
if(matchVersion.onOrAfter(Version.LUCENE_48))
|
||||||
|
result = new ApostropheFilter(result);
|
||||||
result = new TurkishLowerCaseFilter(result);
|
result = new TurkishLowerCaseFilter(result);
|
||||||
result = new StopFilter(matchVersion, result, stopwords);
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
if(!stemExclusionSet.isEmpty())
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
org.apache.lucene.analysis.tr.ApostropheFilterFactory
|
||||||
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
|
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
|
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
|
||||||
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
|
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
|
||||||
|
public class TestApostropheFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testApostropheFilter() throws Exception {
|
||||||
|
TokenStream stream = whitespaceMockTokenizer("Türkiye'de 2003'te Van Gölü'nü gördüm");
|
||||||
|
stream = new TurkishLowerCaseFilter(stream);
|
||||||
|
stream = new ApostropheFilter(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[]{"türkiye", "2003", "van", "gölü", "gördüm"});
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the apostrophe filter factory is working.
|
||||||
|
*/
|
||||||
|
public class TestApostropheFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
/**
|
||||||
|
* Ensure the filter actually removes characters after an apostrophe.
|
||||||
|
*/
|
||||||
|
public void testApostrophes() throws Exception {
|
||||||
|
Reader reader = new StringReader("Türkiye'de 2003'te Van Gölü'nü gördüm");
|
||||||
|
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
((Tokenizer) stream).setReader(reader);
|
||||||
|
stream = tokenFilterFactory("Apostrophe").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[]{"Türkiye", "2003", "Van", "Gölü", "gördüm"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that bogus arguments result in exception
|
||||||
|
*/
|
||||||
|
public void testBogusArguments() throws Exception {
|
||||||
|
try {
|
||||||
|
tokenFilterFactory("Apostrophe", "bogusArg", "bogusValue");
|
||||||
|
fail();
|
||||||
|
} catch (IllegalArgumentException expected) {
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -38,6 +38,9 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
checkOneTerm(a, "ağaç", "ağaç");
|
checkOneTerm(a, "ağaç", "ağaç");
|
||||||
// stopword
|
// stopword
|
||||||
assertAnalyzesTo(a, "dolayı", new String[] {});
|
assertAnalyzesTo(a, "dolayı", new String[] {});
|
||||||
|
// apostrophes
|
||||||
|
checkOneTerm(a, "Kıbrıs'ta", "kıbrıs");
|
||||||
|
assertAnalyzesTo(a, "Van Gölü'ne", new String[]{"van", "göl"});
|
||||||
}
|
}
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
|
|
|
@ -1124,6 +1124,7 @@
|
||||||
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
<filter class="solr.ApostropheFilterFactory"/>
|
||||||
<filter class="solr.TurkishLowerCaseFilterFactory"/>
|
<filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
|
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
|
||||||
|
|
Loading…
Reference in New Issue