mirror of https://github.com/apache/lucene.git
LUCENE-2203: use the snowball vocabulary tests for improved testing
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898950 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7bdac8e555
commit
9da810e7dd
|
@ -131,10 +131,26 @@
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="compile-core" depends="build-analyzers, common.compile-core" />
|
<target name="compile-core" depends="build-analyzers, common.compile-core" />
|
||||||
|
<target name="compile-test" depends="download-vocab-tests, common.compile-test" />
|
||||||
|
|
||||||
<target name="build-analyzers" unless="analyzers.jar.present">
|
<target name="build-analyzers" unless="analyzers.jar.present">
|
||||||
<echo>Snowball building dependency ${analyzers.jar}</echo>
|
<echo>Snowball building dependency ${analyzers.jar}</echo>
|
||||||
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
|
<ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<property name="snowball.vocab.rev" value="500"/>
|
||||||
|
<property name="snowball.vocab.url"
|
||||||
|
value="svn://svn.tartarus.org/snowball/trunk/data"/>
|
||||||
|
<property name="vocab.dir" value="src/test/org/apache/lucene/analysis/snowball"/>
|
||||||
|
|
||||||
|
<target name="download-vocab-tests" depends="compile-core"
|
||||||
|
description="Downloads Snowball vocabulary tests">
|
||||||
|
<sequential>
|
||||||
|
<mkdir dir="${vocab.dir}"/>
|
||||||
|
<exec dir="${vocab.dir}" executable="${svn.exe}"
|
||||||
|
failifexecutionfails="false">
|
||||||
|
<arg line="checkout -r ${snowball.vocab.rev} ${snowball.vocab.url}"/>
|
||||||
|
</exec>
|
||||||
|
</sequential>
|
||||||
|
</target>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
package org.apache.lucene.analysis.snowball;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the snowball filters against the snowball data tests
|
||||||
|
*/
|
||||||
|
public class TestSnowballVocab extends BaseTokenStreamTestCase {
|
||||||
|
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
||||||
|
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
|
static final File dataRoot = new File(dataDir,
|
||||||
|
"org/apache/lucene/analysis/snowball/data");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run all languages against their snowball vocabulary tests.
|
||||||
|
*/
|
||||||
|
public void testStemmers() throws IOException {
|
||||||
|
assertCorrectOutput("Danish", "danish");
|
||||||
|
assertCorrectOutput("Dutch", "dutch");
|
||||||
|
assertCorrectOutput("English", "english");
|
||||||
|
// disabled due to snowball java code generation bug:
|
||||||
|
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||||
|
// assertCorrectOutput("Finnish", "finnish");
|
||||||
|
assertCorrectOutput("French", "french");
|
||||||
|
assertCorrectOutput("German", "german");
|
||||||
|
assertCorrectOutput("German2", "german2");
|
||||||
|
assertCorrectOutput("Hungarian", "hungarian");
|
||||||
|
assertCorrectOutput("Italian", "italian");
|
||||||
|
assertCorrectOutput("Kp", "kraaij_pohlmann");
|
||||||
|
// disabled due to snowball java code generation bug:
|
||||||
|
// see http://article.gmane.org/gmane.comp.search.snowball/1139
|
||||||
|
// assertCorrectOutput("Lovins", "lovins");
|
||||||
|
assertCorrectOutput("Norwegian", "norwegian");
|
||||||
|
assertCorrectOutput("Porter", "porter");
|
||||||
|
assertCorrectOutput("Portuguese", "portuguese");
|
||||||
|
assertCorrectOutput("Romanian", "romanian");
|
||||||
|
assertCorrectOutput("Russian", "russian");
|
||||||
|
assertCorrectOutput("Spanish", "spanish");
|
||||||
|
assertCorrectOutput("Swedish", "swedish");
|
||||||
|
assertCorrectOutput("Turkish", "turkish");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For the supplied language, run the stemmer against all strings in voc.txt
|
||||||
|
* The output should be the same as the string in output.txt
|
||||||
|
*/
|
||||||
|
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
|
||||||
|
throws IOException {
|
||||||
|
System.err.println("checking snowball language: " + snowballLanguage);
|
||||||
|
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
|
||||||
|
InputStream vocFile = new FileInputStream(new File(dataRoot,
|
||||||
|
dataDirectory + "/voc.txt"));
|
||||||
|
InputStream outputFile = new FileInputStream(new File(dataRoot,
|
||||||
|
dataDirectory + "/output.txt"));
|
||||||
|
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
|
||||||
|
vocFile, "UTF-8"));
|
||||||
|
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
|
||||||
|
outputFile, "UTF-8"));
|
||||||
|
String inputWord = null;
|
||||||
|
while ((inputWord = vocReader.readLine()) != null) {
|
||||||
|
String expectedWord = outputReader.readLine();
|
||||||
|
assertNotNull(expectedWord);
|
||||||
|
tokenizer.reset(new StringReader(inputWord));
|
||||||
|
filter.reset();
|
||||||
|
assertTokenStreamContents(filter, new String[] {expectedWord});
|
||||||
|
}
|
||||||
|
vocReader.close();
|
||||||
|
outputReader.close();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue