mirror of https://github.com/apache/lucene.git

LUCENE-2560: add basic stress tests for analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1096178 13f79535-47bb-0310-9956-ffa450edef68

commit 68061ef921 (parent 2714ba90ca)
@@ -19,11 +19,15 @@ package org.apache.lucene.analysis;
 import java.io.StringReader;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
 
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 /**
  * Base class for all Lucene unit tests that use TokenStreams.
@@ -229,4 +233,39 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     assertAnalyzesToReuse(a, input, new String[]{expected});
   }
+
+  // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
+  public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
+    checkRandomData(random, a, iterations, 20);
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    for (int i = 0; i < iterations; i++) {
+      String text;
+      switch(_TestUtil.nextInt(random, 0, 3)) {
+        case 0:
+          text = _TestUtil.randomSimpleString(random);
+          break;
+        case 1:
+          text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
+          break;
+        default:
+          text = _TestUtil.randomUnicodeString(random, maxWordLength);
+      }
+
+      TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
+      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      List<String> tokens = new ArrayList<String>();
+      ts.reset();
+      while (ts.incrementToken()) {
+        tokens.add(termAtt.toString());
+        // TODO: we could collect offsets etc here for better checking that reset() really works.
+      }
+      ts.close();
+      // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
+      if (!tokens.isEmpty())
+        assertAnalyzesToReuse(a, text, tokens.toArray(new String[tokens.size()]));
+    }
+  }
 
 }
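(Every per-analyzer hunk below wires into this utility the same way. As a minimal usage sketch, assuming a hypothetical MyAnalyzer rather than any class from this commit, a test subclass needs only:)

// Hypothetical usage sketch, not part of this commit: a subclass of
// BaseTokenStreamTestCase stress-tests its analyzer in one line.
public class TestMyAnalyzer extends BaseTokenStreamTestCase {
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    // 10000 base iterations, scaled by the test framework's RANDOM_MULTIPLIER
    checkRandomData(random, new MyAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  }
}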
@@ -111,4 +111,8 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesToReuse(analyzer, testString, new String[] { "t" });
   }
 
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new MockAnalyzer(random), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -98,4 +98,9 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
     assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ArabicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -75,4 +75,9 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
     Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new BulgarianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -157,4 +157,8 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, input, expected);
   }
 
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "llengües", "llengües");
     checkOneTermReuse(a, "llengua", "llengu");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new CatalanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -270,4 +270,9 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
         newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
         newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -210,6 +210,13 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
     assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
         "\ud801\udc1ctest" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
 
 final class PayloadSetter extends TokenFilter {
@@ -309,4 +309,9 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
 
     dir.close();
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -102,4 +102,9 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
     assertEquals(0, offsetAtt.startOffset());
     assertEquals(4, offsetAtt.endOffset());
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new KeywordAnalyzer(), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -3,6 +3,7 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
@@ -219,4 +220,9 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "仮", "名", "遣", "い", "カタカナ" },
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -418,4 +418,9 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
         new String[] { "仮", "名", "遣", "い", "カタカナ" },
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -67,4 +67,9 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
     CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new CzechAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "undersøgelse", "undersøgelse");
     checkOneTermReuse(a, "undersøg", "undersøg");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new DanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -63,4 +63,9 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "Schaltflächen", "schaltflach");
     checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new GermanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -45,4 +46,9 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -57,4 +57,9 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.junit.Ignore;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
 
@@ -36,8 +37,6 @@ import static org.apache.lucene.analysis.util.VocabularyAssert.*;
  *
  */
 public class TestGermanStemFilter extends BaseTokenStreamTestCase {
-
-  public void testStemming() throws Exception {
   Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
@@ -48,8 +47,15 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
     }
   };
 
+  public void testStemming() throws Exception {
     InputStream vocOut = getClass().getResourceAsStream("data.txt");
     assertVocabulary(analyzer, vocOut);
     vocOut.close();
   }
+
+  /** blast some random strings through the analyzer */
+  @Ignore("bugs!")
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -87,4 +87,9 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
     assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
         new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new GreekAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -52,4 +52,9 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "books", "books");
     checkOneTermReuse(a, "book", "book");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new EnglishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -51,4 +51,9 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "congress", "congress");
     checkOneTerm(analyzer, "serious", "serious");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -37,11 +37,6 @@ import static org.apache.lucene.analysis.util.VocabularyAssert.*;
  * Test the PorterStemFilter with Martin Porter's test data.
  */
 public class TestPorterStemFilter extends BaseTokenStreamTestCase {
-  /**
-   * Run the stemmer against all strings in voc.txt
-   * The output should be the same as the string in output.txt
-   */
-  public void testPorterStemFilter() throws Exception {
   Analyzer a = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
@@ -51,6 +46,11 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
     }
   };
 
+  /**
+   * Run the stemmer against all strings in voc.txt
+   * The output should be the same as the string in output.txt
+   */
+  public void testPorterStemFilter() throws Exception {
     assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt");
   }
 
@@ -61,4 +61,9 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
     TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
     assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "chicana", "chican");
     checkOneTermReuse(a, "chicano", "chicano");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new SpanishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -45,4 +45,9 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "zaldiak", "zaldiak");
     checkOneTermReuse(a, "mendiari", "mendi");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new BasqueAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -219,4 +219,9 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
         "brown", "fox" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new PersianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
     checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new FinnishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -45,4 +45,9 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -260,4 +260,9 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
     FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
     assertAnalyzesTo(a, "Votre", new String[] { });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -159,4 +159,9 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -59,4 +59,9 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "correspondente", "correspondente");
     checkOneTermReuse(a, "corresponderá", "correspond");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new GalicianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -47,4 +47,9 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
         HindiAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "हिंदी", "हिंदी");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new HindiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "babakocsi", "babakocsi");
     checkOneTermReuse(a, "babakocsijáért", "babakocs");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new HungarianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "արծիվներ", "արծիվներ");
     checkOneTermReuse(a, "արծիվ", "արծ");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ArmenianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "peledakan", "peledakan");
     checkOneTermReuse(a, "pembunuhan", "bunuh");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new IndonesianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "abbandonata", "abbandonata");
     checkOneTermReuse(a, "abbandonati", "abbandon");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -45,4 +45,9 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "tirgiem", "tirgiem");
     checkOneTermReuse(a, "tirgus", "tirg");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new LatvianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -185,4 +185,9 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
     checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 
 }
@@ -50,4 +50,9 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
     checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new NorwegianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "quilométricas", "quilométricas");
     checkOneTermReuse(a, "quilométricos", "quilométr");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new PortugueseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -92,4 +92,9 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -66,4 +66,9 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -66,4 +66,9 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "absenţa", "absenţa");
     checkOneTermReuse(a, "absenţi", "absenţ");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new RomanianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -64,4 +64,9 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
 
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new RussianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -45,4 +45,9 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -23,6 +23,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
 
 public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
@@ -50,4 +51,9 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
     checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new SwedishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -45,4 +45,9 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -143,4 +143,9 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
         "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
         new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "ağacı", "ağacı");
     checkOneTermReuse(a, "ağaç", "ağaç");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new TurkishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -195,4 +195,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       while (stream.incrementToken()) {
       }
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }
@@ -50,4 +50,9 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "studenta", "studenta");
     checkOneTermReuse(a, "studenci", "student");
   }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new PolishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
 }