mirror of https://github.com/apache/lucene.git
LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of LowercaseFilter to correctly handle the unique Turkish casing behavior if used with Version > 3.0 and the TurkishStemmer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888787 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cc619905c4
commit
43c475d296
|
@ -2,6 +2,13 @@ Lucene contrib change Log
|
|||
|
||||
======================= Trunk (not yet released) =======================
|
||||
|
||||
Changes in runtime behavior
|
||||
|
||||
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
|
||||
LowerCaseFilter to correctly handle the unique Turkish casing behavior if
|
||||
used with Version > 3.0 and the TurkishStemmer.
|
||||
(Robert Muir via Simon Willnauer)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
|
||||
|
@ -39,6 +46,10 @@ New features
|
|||
|
||||
Build
|
||||
|
||||
* LUCENE-2117: SnowballAnalyzer now holds a runtime-dependency on
|
||||
contrib-analyzers to correctly handle the unique Turkish casing behavior.
|
||||
(Robert Muir via Simon Willnauer)
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
into core, and moved the ICU-based collation support into contrib/icu.
|
||||
(Robert Muir)
|
||||
|
|
|
@ -30,6 +30,15 @@
|
|||
<property name="snowball.root" value="snowball/website"/>
|
||||
<property name="bin.dir" location="bin"/>
|
||||
|
||||
<property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"/>
|
||||
<available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${lucene.jar}"/>
|
||||
<pathelement path="${analyzers.jar}"/>
|
||||
<pathelement path="${project.classpath}"/>
|
||||
</path>
|
||||
|
||||
<target name="jar" depends="compile" description="Create JAR">
|
||||
<jarify>
|
||||
<metainf-includes>
|
||||
|
@ -121,5 +130,11 @@
|
|||
|
||||
</target>
|
||||
|
||||
<target name="compile-core" depends="build-analyzers, common.compile-core" />
|
||||
|
||||
<target name="build-analyzers" unless="analyzers.jar.present">
|
||||
<echo>Snowball building dependency ${analyzers.jar}</echo>
|
||||
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -33,4 +33,11 @@
|
|||
<version>@version@</version>
|
||||
<description>Snowball Analyzers</description>
|
||||
<packaging>jar</packaging>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers</artifactId>
|
||||
<version>@version@</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.snowball;
|
|||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -33,7 +34,11 @@ import java.util.Set;
|
|||
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
|
||||
*
|
||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
* dependent settings as {@link StandardAnalyzer}, with the following addition:
|
||||
* <ul>
|
||||
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for the Turkish language.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class SnowballAnalyzer extends Analyzer {
|
||||
private String name;
|
||||
|
@ -60,7 +65,11 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
// Use a special lowercase filter for turkish, the stemmer expects it.
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
|
||||
result = new TurkishLowerCaseFilter(result);
|
||||
else
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
if (stopSet != null)
|
||||
result = new StopFilter(matchVersion,
|
||||
result, stopSet);
|
||||
|
@ -91,7 +100,11 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
// Use a special lowercase filter for turkish, the stemmer expects it.
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
|
||||
streams.result = new TurkishLowerCaseFilter(streams.result);
|
||||
else
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
if (stopSet != null)
|
||||
streams.result = new StopFilter(matchVersion,
|
||||
streams.result, stopSet);
|
||||
|
|
|
@ -22,12 +22,20 @@ import java.io.IOException;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
|
||||
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
|
||||
/**
|
||||
* A filter that stems words using a Snowball-generated stemmer.
|
||||
*
|
||||
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
|
||||
* <p><b>NOTE</b>: SnowballFilter expects lowercased text.
|
||||
* <ul>
|
||||
* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
|
||||
* <li>For other languages, see {@link LowerCaseFilter}.
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public final class SnowballFilter extends TokenFilter {
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.snowball;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -41,6 +40,44 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
|||
new String[]{"he", "abhor", "accent"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
|
||||
* we lowercase I correctly for non-Turkish languages in either case.
|
||||
*/
|
||||
public void testEnglishLowerCase() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
|
||||
assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
|
||||
assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
|
||||
|
||||
Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
|
||||
assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
|
||||
assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test turkish lowercasing
|
||||
*/
|
||||
public void testTurkish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
|
||||
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test turkish lowercasing (old buggy behavior)
|
||||
* @deprecated Remove this when support for 3.0 indexes is no longer required
|
||||
*/
|
||||
public void testTurkishBWComp() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
|
||||
// AĞACI lowercases to ağacı in Turkish, but the plain LowerCaseFilter produces ağaci.
|
||||
// this fails due to wrong casing, because the stemmer
|
||||
// will only remove -ı, not -i
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
|
||||
assertAnalyzesToReuse(a, "he abhorred accents",
|
||||
|
|
Loading…
Reference in New Issue