LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of LowercaseFilter to correctly handle the unique Turkish casing behavior if used with Version > 3.0 and the TurkishStemmer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888787 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2009-12-09 12:47:37 +00:00
parent cc619905c4
commit 43c475d296
6 changed files with 95 additions and 4 deletions

View File

@ -2,6 +2,13 @@ Lucene contrib change Log
======================= Trunk (not yet released) =======================
Changes in runtime behavior
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
LowercaseFilter to correctly handle the unique Turkish casing behavior if
used with Version > 3.0 and the TurkishStemmer.
(Robert Muir via Simon Willnauer)
Bug fixes
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
@ -39,6 +46,10 @@ New features
Build
* LUCENE-2117: SnowballAnalyzer now holds a runtime-dependency on
contrib-analyzers to correctly handle the unique Turkish casing behavior.
(Robert Muir via Simon Willnauer)
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
into core, and moved the ICU-based collation support into contrib/icu.
(Robert Muir)

View File

@ -30,6 +30,15 @@
<property name="snowball.root" value="snowball/website"/>
<property name="bin.dir" location="bin"/>
<property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"/>
<available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/>
<path id="classpath">
<pathelement path="${lucene.jar}"/>
<pathelement path="${analyzers.jar}"/>
<pathelement path="${project.classpath}"/>
</path>
<target name="jar" depends="compile" description="Create JAR">
<jarify>
<metainf-includes>
@ -121,5 +130,11 @@
</target>
<target name="compile-core" depends="build-analyzers, common.compile-core" />
<target name="build-analyzers" unless="analyzers.jar.present">
<echo>Snowball building dependency ${analyzers.jar}</echo>
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
</target>
</project>

View File

@ -33,4 +33,11 @@
<version>@version@</version>
<description>Snowball Analyzers</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>@version@</version>
</dependency>
</dependencies>
</project>

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.snowball;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.apache.lucene.util.Version;
import java.io.IOException;
@ -33,7 +34,11 @@ import java.util.Set;
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
* dependent settings as {@link StandardAnalyzer}, with the following addition:
* <ul>
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
* </ul>
* </p>
*/
public class SnowballAnalyzer extends Analyzer {
private String name;
@ -60,7 +65,11 @@ public class SnowballAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(matchVersion, result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
result = new TurkishLowerCaseFilter(result);
else
result = new LowerCaseFilter(matchVersion, result);
if (stopSet != null)
result = new StopFilter(matchVersion,
result, stopSet);
@ -91,7 +100,11 @@ public class SnowballAnalyzer extends Analyzer {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
// Use a special lowercase filter for turkish, the stemmer expects it.
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
streams.result = new TurkishLowerCaseFilter(streams.result);
else
streams.result = new LowerCaseFilter(matchVersion, streams.result);
if (stopSet != null)
streams.result = new StopFilter(matchVersion,
streams.result, stopSet);

View File

@ -22,12 +22,20 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
import org.tartarus.snowball.SnowballProgram;
/**
* A filter that stems words using a Snowball-generated stemmer.
*
* Available stemmers are listed in {@link org.tartarus.snowball.ext}.
* <p><b>NOTE</b>: SnowballFilter expects lowercased text.
* <ul>
* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
* <li>For other languages, see {@link LowerCaseFilter}.
* </ul>
* </p>
*/
public final class SnowballFilter extends TokenFilter {

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.snowball;
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
@ -41,6 +40,44 @@ public class TestSnowball extends BaseTokenStreamTestCase {
new String[]{"he", "abhor", "accent"});
}
/**
* Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
* we lowercase I correct for non-Turkish languages in either case.
*/
public void testEnglishLowerCase() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
}
/**
* Test turkish lowercasing
*/
public void testTurkish() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
}
/**
* Test turkish lowercasing (old buggy behavior)
* @deprecated Remove this when support for 3.0 indexes is no longer required
*/
public void testTurkishBWComp() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
// AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
// this fails due to wrong casing, because the stemmer
// will only remove -ı, not -i
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
assertAnalyzesToReuse(a, "he abhorred accents",