LUCENE-3919: fix czechstemmer aioobe on the empty term

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1305177 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-03-25 23:40:44 +00:00
parent cb1a9a0cdf
commit 35705cc396
67 changed files with 917 additions and 5 deletions

View File

@ -44,7 +44,9 @@ public class CzechStemmer {
public int stem(char s[], int len) {
  // Stems the term held in s[0..len) and returns the resulting length.
  // Each helper takes the buffer and current length and returns the new length.
  len = removeCase(s, len);
  len = removePossessives(s, len);
  // Normalize exactly once, and only when something is left of the term:
  // LUCENE-3919 reported an ArrayIndexOutOfBoundsException on the empty term,
  // so normalize must never be reached with len == 0.
  if (len > 0) {
    len = normalize(s, len);
  }
  return len;
}

View File

@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ar;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Arabic Normalization Filter
@ -89,4 +93,15 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[]{expected});
}
/**
 * Checks that {@link ArabicNormalizationFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final ArabicNormalizationFilter filtered = new ArabicNormalizationFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ar;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
@ -128,4 +132,15 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}
/**
 * Checks that {@link ArabicStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final ArabicStemFilter filtered = new ArabicStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,10 +18,14 @@ package org.apache.lucene.analysis.bg;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@ -221,4 +225,15 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
new KeywordMarkerFilter(tokenStream, set));
assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}
/**
 * Checks that {@link BulgarianStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final BulgarianStemFilter filtered = new BulgarianStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,11 +18,13 @@ package org.apache.lucene.analysis.br;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
@ -162,4 +164,15 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Build the analyzer and scale the base count by RANDOM_MULTIPLIER before the random-data check.
  final Analyzer brazilian = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, brazilian, scaledCount);
}
/**
 * Checks that {@link BrazilianStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final BrazilianStemFilter filtered = new BrazilianStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -23,13 +23,13 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@ -277,4 +277,15 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
public void testRandomHugeStrings() throws Exception {
  // Build the analyzer and scale the base count by RANDOM_MULTIPLIER before the random-data check.
  final Analyzer cjk = new CJKAnalyzer(TEST_VERSION_CURRENT);
  final int scaledCount = 200 * RANDOM_MULTIPLIER;
  checkRandomData(random, cjk, scaledCount, 8192);
}
/**
 * Checks that {@link CJKBigramFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final CJKBigramFilter filtered = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests for {@link CJKWidthFilter}
@ -64,4 +65,15 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
public void testRandomData() throws IOException {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link CJKWidthFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final CJKWidthFilter filtered = new CJKWidthFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -361,4 +362,30 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
/**
 * Checks that both compound-word filters cope with a zero-length term (LUCENE-3919):
 * first {@link DictionaryCompoundWordTokenFilter}, then
 * {@link HyphenationCompoundWordTokenFilter} built from the da_UTF8.xml resource.
 * Each filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws Exception {
  // Dictionary-based decompounder.
  final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
  final Analyzer dictionaryAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final DictionaryCompoundWordTokenFilter decompounded =
          new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, dict);
      return new TokenStreamComponents(source, decompounded);
    }
  };
  checkOneTermReuse(dictionaryAnalyzer, "", "");

  // Hyphenation-based decompounder, using the hyphenation tree loaded from the test resource.
  final String grammarUrl = getClass().getResource("da_UTF8.xml").toExternalForm();
  final InputSource is = new InputSource(grammarUrl);
  final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
  final Analyzer hyphenationAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      return new TokenStreamComponents(source,
          new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, hyphenator));
    }
  };
  checkOneTermReuse(hyphenationAnalyzer, "", "");
}
}

View File

@ -18,10 +18,14 @@ package org.apache.lucene.analysis.cz;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
@ -282,4 +286,15 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}
/**
 * Checks that {@link CzechStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final CzechStemFilter filtered = new CzechStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link GermanLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GermanLightStemFilter filtered = new GermanLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -61,4 +62,15 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link GermanMinimalStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GermanMinimalStemFilter filtered = new GermanMinimalStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests {@link GermanNormalizationFilter}
@ -65,4 +66,15 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link GermanNormalizationFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GermanNormalizationFilter filtered = new GermanNormalizationFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.de;
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@ -61,4 +62,15 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link GermanStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GermanStemFilter filtered = new GermanStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -17,8 +17,13 @@ package org.apache.lucene.analysis.el;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
public class TestGreekStemmer extends BaseTokenStreamTestCase {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
@ -522,4 +527,15 @@ public class TestGreekStemmer extends BaseTokenStreamTestCase {
checkOneTerm(a, "αρχοντασ", "αρχοντ");
checkOneTerm(a, "αρχοντων", "αρχοντ");
}
/**
 * Checks that {@link GreekStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GreekStemFilter filtered = new GreekStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Simple tests for {@link EnglishMinimalStemFilter}
@ -55,4 +56,15 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link EnglishMinimalStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final EnglishMinimalStemFilter filtered = new EnglishMinimalStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -19,12 +19,14 @@ package org.apache.lucene.analysis.en;
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests for {@link KStemmer}
@ -52,6 +54,17 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
assertVocabulary(a, getDataFile("kstemTestData.zip"), "kstem_examples.txt");
}
/**
 * Checks that {@link KStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final KStemFilter filtered = new KStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
/****** requires original java kstem source code to create map
public void testCreateMap() throws Exception {
String input = getBigDoc();

View File

@ -22,6 +22,7 @@ import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.Analyzer;
@ -64,4 +65,15 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, a, scaledCount);
}
/**
 * Checks that {@link PorterStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final PorterStemFilter filtered = new PorterStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link SpanishLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final SpanishLightStemFilter filtered = new SpanishLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,10 +18,14 @@ package org.apache.lucene.analysis.fa;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Persian Normalization Filter
@ -61,4 +65,15 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[]{expected});
}
/**
 * Checks that {@link PersianNormalizationFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final PersianNormalizationFilter filtered = new PersianNormalizationFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link FinnishLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final FinnishLightStemFilter filtered = new FinnishLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,13 +18,16 @@ package org.apache.lucene.analysis.fr;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@ -54,4 +57,15 @@ public class TestElision extends BaseTokenStreamTestCase {
return tas;
}
/**
 * Checks that {@link ElisionFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final ElisionFilter filtered = new ElisionFilter(TEST_VERSION_CURRENT, source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -163,4 +164,15 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link FrenchLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final FrenchLightStemFilter filtered = new FrenchLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -63,4 +64,15 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link FrenchMinimalStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final FrenchMinimalStemFilter filtered = new FrenchMinimalStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ga;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Irish lowercase filter.
@ -38,4 +43,15 @@ public class TestIrishLowerCaseFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[] {"n-athair", "t-uisce",
"hard",});
}
/**
 * Checks that {@link IrishLowerCaseFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final IrishLowerCaseFilter filtered = new IrishLowerCaseFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -17,12 +17,14 @@ package org.apache.lucene.analysis.gl;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Simple tests for {@link GalicianMinimalStemmer}
@ -52,4 +54,15 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, a, scaledCount);
}
/**
 * Checks that {@link GalicianMinimalStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GalicianMinimalStemFilter filtered = new GalicianMinimalStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -48,4 +49,15 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
public void testVocabulary() throws IOException {
// Runs the vocabulary assertion for `analyzer` using "gl.txt" from the gltestdata.zip
// data file (presumably an entry inside that zip; confirm against assertVocabulary).
assertVocabulary(analyzer, getDataFile("gltestdata.zip"), "gl.txt");
}
/**
 * Checks that {@link GalicianStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final GalicianStemFilter filtered = new GalicianStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hi;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test HindiNormalizer
@ -63,4 +66,15 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
TokenFilter tf = new HindiNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}
/**
 * Checks that {@link HindiNormalizationFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final HindiNormalizationFilter filtered = new HindiNormalizationFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hi;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test HindiStemmer
@ -85,4 +88,15 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
TokenFilter tf = new HindiStemFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}
/**
 * Checks that {@link HindiStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final HindiStemFilter filtered = new HindiStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -44,4 +45,15 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
public void testVocabulary() throws IOException {
// Runs the vocabulary assertion for `analyzer` using "hulight.txt" from the hulighttestdata.zip
// data file (presumably an entry inside that zip; confirm against assertVocabulary).
assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
}
/**
 * Checks that {@link HungarianLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final HungarianLightStemFilter filtered = new HungarianLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.BeforeClass;
@ -73,4 +74,15 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
};
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Checks that {@link HunspellStemFilter}, configured with the class's DICTIONARY, copes
 * with a zero-length term (LUCENE-3919). The filter is fed by a {@link KeywordTokenizer},
 * and the empty input is expected to come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final HunspellStemFilter filtered = new HunspellStemFilter(source, DICTIONARY);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -132,4 +132,15 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
checkOneTermReuse(a, "bukukah", "buku");
checkOneTermReuse(a, "gigi", "gigi");
}
/**
 * Checks that {@link IndonesianStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final IndonesianStemFilter filtered = new IndonesianStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.in;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test IndicNormalizer
@ -48,4 +51,15 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}
/**
 * Checks that {@link IndicNormalizationFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final IndicNormalizationFilter filtered = new IndicNormalizationFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
  // Scale the base count by RANDOM_MULTIPLIER before handing it to checkRandomData.
  final int scaledCount = 10000 * RANDOM_MULTIPLIER;
  checkRandomData(random, analyzer, scaledCount);
}
/**
 * Checks that {@link ItalianLightStemFilter} copes with a zero-length term (LUCENE-3919).
 * The filter is fed by a {@link KeywordTokenizer}, and the empty input is expected to
 * come back as the empty term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer emptyTermAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new KeywordTokenizer(reader);
      final ItalianLightStemFilter filtered = new ItalianLightStemFilter(source);
      return new TokenStreamComponents(source, filtered);
    }
  };
  checkOneTermReuse(emptyTermAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Basic tests for {@link LatvianStemmer}
@ -268,4 +269,15 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
checkOneTerm(a, "usa", "usa"); // length
checkOneTerm(a, "60ms", "60ms"); // vowel count
}
/**
 * Empty-term check for {@link LatvianStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> LatvianStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final LatvianStemFilter stemmed = new LatvianStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -22,8 +22,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
@ -1923,4 +1925,15 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link ASCIIFoldingFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ASCIIFoldingFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ASCIIFoldingFilter folded = new ASCIIFoldingFilter(source);
      return new TokenStreamComponents(source, folded);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
@ -133,4 +134,15 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link CapitalizationFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> CapitalizationFilter (single-argument constructor)
      final Tokenizer source = new KeywordTokenizer(reader);
      final CapitalizationFilter capitalized = new CapitalizationFilter(source);
      return new TokenStreamComponents(source, capitalized);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -25,6 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* HyphenatedWordsFilter test
@ -74,4 +76,15 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link HyphenatedWordsFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> HyphenatedWordsFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final HyphenatedWordsFilter joined = new HyphenatedWordsFilter(source);
      return new TokenStreamComponents(source, joined);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -18,6 +18,10 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
public class TestLengthFilter extends BaseTokenStreamTestCase {
@ -42,4 +46,15 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
);
}
/**
 * Empty-term check for {@link LengthFilter} constructed with the arguments
 * {@code (true, tokenizer, 0, 5)}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> LengthFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final LengthFilter lengthChecked = new LengthFilter(true, source, 0, 5);
      return new TokenStreamComponents(source, lengthChecked);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@ -31,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import java.util.Arrays;
@ -165,4 +167,15 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
}
}
/**
 * Empty-term check for {@link RemoveDuplicatesTokenFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> RemoveDuplicatesTokenFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final RemoveDuplicatesTokenFilter deduplicated = new RemoveDuplicatesTokenFilter(source);
      return new TokenStreamComponents(source, deduplicated);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
@ -130,4 +131,15 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link TrimFilter}, built with a randomly chosen boolean option:
 * the empty string is handed to {@code checkOneTermReuse} as both the input text and
 * the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> TrimFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final TrimFilter trimmed = new TrimFilter(source, random.nextBoolean());
      return new TokenStreamComponents(source, trimmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -18,7 +18,10 @@
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -322,4 +325,26 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}
/**
 * Runs the empty string through {@link WordDelimiterFilter} once for every flags value
 * in {@code [0, 512)}, each time randomly with either a small protected-words set or
 * {@code null}. Only {@code checkAnalysisConsistency} is applied; no expected term is
 * asserted.
 */
public void testEmptyTerm() throws IOException {
  for (int iteration = 0; iteration < 512; iteration++) {
    // copied into finals so the anonymous Analyzer below can capture them
    final int flags = iteration;
    final CharArraySet protectedWords = random.nextBoolean()
        ? new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false)
        : null;

    final Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        final WordDelimiterFilter delimited = new WordDelimiterFilter(source, flags, protectedWords);
        return new TokenStreamComponents(source, delimited);
      }
    };
    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(random, keywordAnalyzer, random.nextBoolean(), "");
  }
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@ -152,4 +153,26 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
/**
 * Runs the empty string through {@link EdgeNGramTokenFilter} (size arguments 2 and 15),
 * first with {@code Side.FRONT} and then with {@code Side.BACK}. Only
 * {@code checkAnalysisConsistency} is applied; no expected term is asserted.
 */
public void testEmptyTerm() throws Exception {
  // same order as before: FRONT is built and checked before BACK is built
  final EdgeNGramTokenFilter.Side[] sides = {
    EdgeNGramTokenFilter.Side.FRONT,
    EdgeNGramTokenFilter.Side.BACK
  };
  for (final EdgeNGramTokenFilter.Side side : sides) {
    final Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        final EdgeNGramTokenFilter grams = new EdgeNGramTokenFilter(source, side, 2, 15);
        return new TokenStreamComponents(source, grams);
      }
    };
    checkAnalysisConsistency(random, keywordAnalyzer, random.nextBoolean(), "");
  }
}
}

View File

@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@ -132,4 +132,16 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Runs the empty string through {@link NGramTokenFilter} (size arguments 2 and 15).
 * Only {@code checkAnalysisConsistency} is applied; no expected term is asserted.
 */
public void testEmptyTerm() throws Exception {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> NGramTokenFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final NGramTokenFilter grams = new NGramTokenFilter(source, 2, 15);
      return new TokenStreamComponents(source, grams);
    }
  };
  checkAnalysisConsistency(random, keywordAnalyzer, random.nextBoolean(), "");
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -50,4 +51,15 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link NorwegianLightStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> NorwegianLightStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final NorwegianLightStemFilter stemmed = new NorwegianLightStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -50,4 +51,15 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link NorwegianMinimalStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> NorwegianMinimalStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final NorwegianMinimalStemFilter stemmed = new NorwegianMinimalStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -22,7 +22,9 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
@ -104,4 +106,15 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link PatternReplaceFilter} configured with pattern {@code "a"},
 * replacement {@code "b"} and the boolean option {@code true}: the empty string is handed
 * to {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> PatternReplaceFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final Pattern pattern = Pattern.compile("a");
      final PatternReplaceFilter replaced = new PatternReplaceFilter(source, pattern, "b", true);
      return new TokenStreamComponents(source, replaced);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -96,4 +97,15 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link PortugueseLightStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> PortugueseLightStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final PortugueseLightStemFilter stemmed = new PortugueseLightStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -70,4 +71,15 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link PortugueseMinimalStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> PortugueseMinimalStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final PortugueseMinimalStemFilter stemmed = new PortugueseMinimalStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -70,4 +71,15 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link PortugueseStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> PortugueseStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final PortugueseStemFilter stemmed = new PortugueseStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.reverse;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -25,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.Version;
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@ -111,4 +113,15 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link ReverseStringFilter} (built with {@code TEST_VERSION_CURRENT}):
 * the empty string is handed to {@code checkOneTermReuse} as both the input text and the
 * expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ReverseStringFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ReverseStringFilter reversed = new ReverseStringFilter(TEST_VERSION_CURRENT, source);
      return new TokenStreamComponents(source, reversed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link RussianLightStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> RussianLightStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final RussianLightStemFilter stemmed = new RussianLightStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
@ -1156,4 +1157,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
};
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
}
/**
 * Empty-term check for {@link ShingleFilter} (single-argument constructor): the empty
 * string is handed to {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ShingleFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ShingleFilter shingled = new ShingleFilter(source);
      return new TokenStreamComponents(source, shingled);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,10 +17,15 @@ package org.apache.lucene.analysis.snowball;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@ -136,4 +141,23 @@ public class TestSnowball extends BaseTokenStreamTestCase {
return true;
}
}
/**
 * Empty-term check for {@link SnowballFilter}, repeated for each stemmer name listed
 * below: the empty string is handed to {@code checkOneTermReuse} as both the input text
 * and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final String[] languages = {
    "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
    "Finnish", "French", "German2", "German", "Hungarian", "Irish",
    "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
    "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
  };
  for (int idx = 0; idx < languages.length; idx++) {
    // final copy so the anonymous Analyzer can capture the current name
    final String language = languages[idx];
    final Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        final SnowballFilter stemmed = new SnowballFilter(source, language);
        return new TokenStreamComponents(source, stemmed);
      }
    };
    checkOneTermReuse(keywordAnalyzer, "", "");
  }
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -49,4 +50,15 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link SwedishLightStemFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> SwedishLightStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final SwedishLightStemFilter stemmed = new SwedishLightStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.synonym;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
@ -32,6 +33,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
@ -428,6 +430,29 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
}
}
/**
 * Runs the empty string through {@link SynonymFilter} against several randomly built
 * synonym maps. Each round rebuilds the shared builder {@code b}, fills it with random
 * non-empty entries, and then applies {@code checkAnalysisConsistency}; no expected
 * term is asserted.
 */
public void testEmptyTerm() throws IOException {
  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    b = new SynonymMap.Builder(random.nextBoolean());
    final int entryCount = atLeast(10);
    for (int entry = 0; entry < entryCount; entry++) {
      add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
    }
    final SynonymMap synonyms = b.build();
    final boolean ignoreCase = random.nextBoolean();

    final Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        final SynonymFilter expanded = new SynonymFilter(source, synonyms, ignoreCase);
        return new TokenStreamComponents(source, expanded);
      }
    };
    checkAnalysisConsistency(random, keywordAnalyzer, random.nextBoolean(), "");
  }
}
/** simple random test like testRandom2, but for large docs
*/
public void testRandomHuge() throws Exception {

View File

@ -17,10 +17,15 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@ -183,4 +188,15 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
ts.addAttribute(FlagsAttribute.class);
assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
}
/**
 * Empty-term check for {@link ThaiWordFilter} (built with {@code TEST_VERSION_CURRENT}):
 * the empty string is handed to {@code checkOneTermReuse} as both the input text and the
 * expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ThaiWordFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ThaiWordFilter segmented = new ThaiWordFilter(TEST_VERSION_CURRENT, source);
      return new TokenStreamComponents(source, segmented);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,11 +17,16 @@ package org.apache.lucene.analysis.tr;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Test the Turkish lowercase filter.
@ -62,4 +67,15 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
"\u0131\u0316sparta",});
}
/**
 * Empty-term check for {@link TurkishLowerCaseFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> TurkishLowerCaseFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final TurkishLowerCaseFilter lowered = new TurkishLowerCaseFilter(source);
      return new TokenStreamComponents(source, lowered);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests ICUFoldingFilter
@ -77,4 +78,15 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link ICUFoldingFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ICUFoldingFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ICUFoldingFilter folded = new ICUFoldingFilter(source);
      return new TokenStreamComponents(source, folded);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import com.ibm.icu.text.Normalizer2;
@ -77,4 +78,15 @@ public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link ICUNormalizer2Filter} (single-argument constructor): the
 * empty string is handed to {@code checkOneTermReuse} as both the input text and the
 * expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ICUNormalizer2Filter
      final Tokenizer source = new KeywordTokenizer(reader);
      final ICUNormalizer2Filter normalized = new ICUNormalizer2Filter(source);
      return new TokenStreamComponents(source, normalized);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -100,4 +100,15 @@ public class TestICUTransformFilter extends BaseTokenStreamTestCase {
};
checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link ICUTransformFilter} using the {@code "Any-Latin"}
 * transliterator: the empty string is handed to {@code checkOneTermReuse} as both the
 * input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> ICUTransformFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
      final ICUTransformFilter transformed = new ICUTransformFilter(source, anyLatin);
      return new TokenStreamComponents(source, transformed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -23,6 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@ -47,4 +48,15 @@ public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws IOException {
checkRandomData(random, analyzer, atLeast(10000));
}
/**
 * Empty-term check for {@link KuromojiBaseFormFilter}: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> KuromojiBaseFormFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final KuromojiBaseFormFilter baseForms = new KuromojiBaseFormFilter(source);
      return new TokenStreamComponents(source, baseForms);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
import java.io.Reader;
@ -68,4 +69,15 @@ public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
public void testRandomData() throws IOException {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link KuromojiKatakanaStemFilter} (single-argument constructor):
 * the empty string is handed to {@code checkOneTermReuse} as both the input text and the
 * expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> KuromojiKatakanaStemFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final KuromojiKatakanaStemFilter stemmed = new KuromojiKatakanaStemFilter(source);
      return new TokenStreamComponents(source, stemmed);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.kuromoji;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
import java.io.Reader;
@ -61,4 +62,15 @@ public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link KuromojiReadingFormFilter} (single-argument constructor):
 * the empty string is handed to {@code checkOneTermReuse} as both the input text and the
 * expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> KuromojiReadingFormFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final KuromojiReadingFormFilter readings = new KuromojiReadingFormFilter(source);
      return new TokenStreamComponents(source, readings);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -24,7 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util._TestUtil;
@ -95,4 +96,15 @@ public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
};
checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link DoubleMetaphoneFilter}, built with the int argument 8 and a
 * randomly chosen boolean option: the empty string is handed to {@code checkOneTermReuse}
 * as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> DoubleMetaphoneFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final DoubleMetaphoneFilter encoded = new DoubleMetaphoneFilter(source, 8, random.nextBoolean());
      return new TokenStreamComponents(source, encoded);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.phonetic;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
@ -28,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.junit.Ignore;
/** Tests {@link BeiderMorseFilter} */
@ -91,4 +93,15 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
public void testRandom() throws Exception {
checkRandomData(random, analyzer, 1000 * RANDOM_MULTIPLIER);
}
/**
 * Empty-term check for {@link BeiderMorseFilter} backed by a
 * {@code PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)}: the empty string is
 * handed to {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> BeiderMorseFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true);
      final BeiderMorseFilter encoded = new BeiderMorseFilter(source, engine);
      return new TokenStreamComponents(source, encoded);
    }
  };
  checkOneTermReuse(keywordAnalyzer, "", "");
}
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
@ -103,4 +104,20 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase {
checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
}
}
/**
 * Empty-term check for {@link PhoneticFilter}, repeated for each encoder listed below and
 * with a randomly chosen boolean option: the empty string is handed to
 * {@code checkOneTermReuse} as both the input text and the expected term.
 */
public void testEmptyTerm() throws IOException {
  final Encoder[] encoders = {
    new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
  };
  for (int idx = 0; idx < encoders.length; idx++) {
    // final copy so the anonymous Analyzer can capture the current encoder
    final Encoder encoder = encoders[idx];
    final Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new KeywordTokenizer(reader);
        final PhoneticFilter encoded = new PhoneticFilter(source, encoder, random.nextBoolean());
        return new TokenStreamComponents(source, encoded);
      }
    };
    checkOneTermReuse(keywordAnalyzer, "", "");
  }
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.cn.smart;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -26,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.Version;
@ -228,4 +230,15 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
public void testRandomHugeStrings() throws Exception {
checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
}
/**
 * Runs the empty string through {@link WordTokenFilter}. Only
 * {@code checkAnalysisConsistency} is applied; no expected term is asserted.
 */
public void testEmptyTerm() throws IOException {
  final Analyzer keywordAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // chain under test: KeywordTokenizer -> WordTokenFilter
      final Tokenizer source = new KeywordTokenizer(reader);
      final WordTokenFilter words = new WordTokenFilter(source);
      return new TokenStreamComponents(source, words);
    }
  };
  checkAnalysisConsistency(random, keywordAnalyzer, random.nextBoolean(), "");
}
}