LUCENE-2203: use the snowball vocabulary tests for improved testing

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898950 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-01-13 21:51:51 +00:00
parent 7bdac8e555
commit 9da810e7dd
2 changed files with 114 additions and 0 deletions

View File

@ -131,10 +131,26 @@
</target> </target>
<target name="compile-core" depends="build-analyzers, common.compile-core" /> <target name="compile-core" depends="build-analyzers, common.compile-core" />
<target name="compile-test" depends="download-vocab-tests, common.compile-test" />
<target name="build-analyzers" unless="analyzers.jar.present"> <target name="build-analyzers" unless="analyzers.jar.present">
<echo>Snowball building dependency ${analyzers.jar}</echo> <echo>Snowball building dependency ${analyzers.jar}</echo>
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" /> <ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
</target> </target>
<property name="snowball.vocab.rev" value="500"/>
<property name="snowball.vocab.url"
value="svn://svn.tartarus.org/snowball/trunk/data"/>
<property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
<target name="download-vocab-tests" depends="compile-core"
description="Downloads Snowball vocabulary tests">
<sequential>
<mkdir dir="${vocab.dir}"/>
<exec dir="${vocab.dir}" executable="${svn.exe}"
failifexecutionfails="false">
<arg line="checkout -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
</exec>
</sequential>
</target>
</project> </project>

View File

@ -0,0 +1,98 @@
package org.apache.lucene.analysis.snowball;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Test the snowball filters against the snowball data tests
*/
public class TestSnowballVocab extends BaseTokenStreamTestCase {
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
static final File dataRoot = new File(dataDir,
"org/apache/lucene/analysis/snowball/data");
/**
* Run all languages against their snowball vocabulary tests.
*/
public void testStemmers() throws IOException {
assertCorrectOutput("Danish", "danish");
assertCorrectOutput("Dutch", "dutch");
assertCorrectOutput("English", "english");
// disabled due to snowball java code generation bug:
// see http://article.gmane.org/gmane.comp.search.snowball/1139
// assertCorrectOutput("Finnish", "finnish");
assertCorrectOutput("French", "french");
assertCorrectOutput("German", "german");
assertCorrectOutput("German2", "german2");
assertCorrectOutput("Hungarian", "hungarian");
assertCorrectOutput("Italian", "italian");
assertCorrectOutput("Kp", "kraaij_pohlmann");
// disabled due to snowball java code generation bug:
// see http://article.gmane.org/gmane.comp.search.snowball/1139
// assertCorrectOutput("Lovins", "lovins");
assertCorrectOutput("Norwegian", "norwegian");
assertCorrectOutput("Porter", "porter");
assertCorrectOutput("Portuguese", "portuguese");
assertCorrectOutput("Romanian", "romanian");
assertCorrectOutput("Russian", "russian");
assertCorrectOutput("Spanish", "spanish");
assertCorrectOutput("Swedish", "swedish");
assertCorrectOutput("Turkish", "turkish");
}
/**
* For the supplied language, run the stemmer against all strings in voc.txt
* The output should be the same as the string in output.txt
*/
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
throws IOException {
System.err.println("checking snowball language: " + snowballLanguage);
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
InputStream vocFile = new FileInputStream(new File(dataRoot,
dataDirectory + "/voc.txt"));
InputStream outputFile = new FileInputStream(new File(dataRoot,
dataDirectory + "/output.txt"));
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
vocFile, "UTF-8"));
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
outputFile, "UTF-8"));
String inputWord = null;
while ((inputWord = vocReader.readLine()) != null) {
String expectedWord = outputReader.readLine();
assertNotNull(expectedWord);
tokenizer.reset(new StringReader(inputWord));
filter.reset();
assertTokenStreamContents(filter, new String[] {expectedWord});
}
vocReader.close();
outputReader.close();
}
}