mirror of https://github.com/apache/lucene.git
LUCENE-2203: use the snowball vocabulary tests for improved testing
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898950 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7bdac8e555
commit
9da810e7dd
|
@ -131,10 +131,26 @@
|
|||
</target>
|
||||
|
||||
<!-- Core compilation: runs build-analyzers first (see the depends list), then
     common.compile-core (presumably supplied by the shared/common build file; confirm). -->
<target name="compile-core" depends="build-analyzers, common.compile-core" />
|
||||
<!-- Test compilation: runs download-vocab-tests first so the Snowball vocabulary
     checkout is attempted before common.compile-test (presumably from the shared build; confirm). -->
<target name="compile-test" depends="download-vocab-tests, common.compile-test" />
|
||||
|
||||
<!-- Builds the analyzers module this contrib depends on. The whole target is
     skipped when the analyzers.jar.present property is set. -->
<target unless="analyzers.jar.present" name="build-analyzers">
  <echo message="Snowball building dependency ${analyzers.jar}"/>
  <!-- Run the default target of the analyzers build without inheriting this
       project's properties. -->
  <ant dir="../analyzers"
       antfile="../analyzers/build.xml"
       target="default"
       inheritall="false"/>
</target>
|
||||
|
||||
<!-- Revision and repository location used when checking out the Snowball
     vocabulary test data. -->
<property name="snowball.vocab.rev" value="500"/>
<property name="snowball.vocab.url" value="svn://svn.tartarus.org/snowball/trunk/data"/>
<!-- Directory in which the checkout command is executed. -->
<property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
|
||||
|
||||
<!--
  Checks out the Snowball vocabulary test data at revision ${snowball.vocab.rev}
  from ${snowball.vocab.url}, running the checkout inside ${vocab.dir}.
  The svn.exe property is not defined in this part of the file; presumably it
  comes from the shared build configuration (confirm).
-->
<target name="download-vocab-tests"
        description="Downloads Snowball vocabulary tests"
        depends="compile-core">
  <sequential>
    <!-- The working directory must exist before exec can run in it. -->
    <mkdir dir="${vocab.dir}"/>
    <!-- failifexecutionfails="false": the build carries on when the svn
         executable cannot be launched at all. -->
    <exec executable="${svn.exe}"
          dir="${vocab.dir}"
          failifexecutionfails="false">
      <arg line="checkout -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
    </exec>
  </sequential>
</target>
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Test the snowball filters against the snowball data tests
|
||||
*/
|
||||
public class TestSnowballVocab extends BaseTokenStreamTestCase {
|
||||
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
||||
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
static final File dataRoot = new File(dataDir,
|
||||
"org/apache/lucene/analysis/snowball/data");
|
||||
|
||||
/**
|
||||
* Run all languages against their snowball vocabulary tests.
|
||||
*/
|
||||
public void testStemmers() throws IOException {
|
||||
assertCorrectOutput("Danish", "danish");
|
||||
assertCorrectOutput("Dutch", "dutch");
|
||||
assertCorrectOutput("English", "english");
|
||||
// disabled due to snowball java code generation bug:
|
||||
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||
// assertCorrectOutput("Finnish", "finnish");
|
||||
assertCorrectOutput("French", "french");
|
||||
assertCorrectOutput("German", "german");
|
||||
assertCorrectOutput("German2", "german2");
|
||||
assertCorrectOutput("Hungarian", "hungarian");
|
||||
assertCorrectOutput("Italian", "italian");
|
||||
assertCorrectOutput("Kp", "kraaij_pohlmann");
|
||||
// disabled due to snowball java code generation bug:
|
||||
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||
// assertCorrectOutput("Lovins", "lovins");
|
||||
assertCorrectOutput("Norwegian", "norwegian");
|
||||
assertCorrectOutput("Porter", "porter");
|
||||
assertCorrectOutput("Portuguese", "portuguese");
|
||||
assertCorrectOutput("Romanian", "romanian");
|
||||
assertCorrectOutput("Russian", "russian");
|
||||
assertCorrectOutput("Spanish", "spanish");
|
||||
assertCorrectOutput("Swedish", "swedish");
|
||||
assertCorrectOutput("Turkish", "turkish");
|
||||
}
|
||||
|
||||
/**
|
||||
* For the supplied language, run the stemmer against all strings in voc.txt
|
||||
* The output should be the same as the string in output.txt
|
||||
*/
|
||||
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
|
||||
throws IOException {
|
||||
System.err.println("checking snowball language: " + snowballLanguage);
|
||||
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
|
||||
InputStream vocFile = new FileInputStream(new File(dataRoot,
|
||||
dataDirectory + "/voc.txt"));
|
||||
InputStream outputFile = new FileInputStream(new File(dataRoot,
|
||||
dataDirectory + "/output.txt"));
|
||||
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
|
||||
vocFile, "UTF-8"));
|
||||
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
|
||||
outputFile, "UTF-8"));
|
||||
String inputWord = null;
|
||||
while ((inputWord = vocReader.readLine()) != null) {
|
||||
String expectedWord = outputReader.readLine();
|
||||
assertNotNull(expectedWord);
|
||||
tokenizer.reset(new StringReader(inputWord));
|
||||
filter.reset();
|
||||
assertTokenStreamContents(filter, new String[] {expectedWord});
|
||||
}
|
||||
vocReader.close();
|
||||
outputReader.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue