diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
index 32980cc9056..120b287d820 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
@@ -44,7 +44,9 @@ public class CzechStemmer {
   public int stem(char s[], int len) {
     len = removeCase(s, len);
     len = removePossessives(s, len);
-    len = normalize(s, len);
+    if (len > 0) {
+      len = normalize(s, len);
+    }
     return len;
   }
 
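The new length check guards the call into normalize(): on a zero-length term, normalize() (in the shipped class, not shown in this hunk) starts by inspecting the last character via s[len - 1], which underflows the buffer. A minimal sketch of the case the guard covers, using only the public stem() entry point shown above:

    // Hypothetical repro: a zero-length term, as produced by KeywordTokenizer
    // when the input is the empty string -- exactly what the new tests exercise.
    CzechStemmer stemmer = new CzechStemmer();
    int len = stemmer.stem(new char[0], 0); // with the guard: returns 0 instead of
                                            // throwing ArrayIndexOutOfBoundsException
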
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
index 044ce997503..18b246587b1 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
@@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ar;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test the Arabic Normalization Filter
@@ -88,5 +92,16 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
     ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
     assertTokenStreamContents(filter, new String[]{expected});
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
index 80039e54843..a87ab7715ea 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
@@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ar;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -128,4 +132,15 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
     ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
     assertTokenStreamContents(filter, new String[]{expected});
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
index 9efd856604b..21f256e4b60 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
@@ -18,10 +18,14 @@ package org.apache.lucene.analysis.bg;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
@@ -221,4 +225,15 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
         new KeywordMarkerFilter(tokenStream, set));
     assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new BulgarianStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 5d3b42e0704..427015fd504 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -18,11 +18,13 @@ package org.apache.lucene.analysis.br;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
-import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -162,4 +164,15 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
\ No newline at end of file
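All of the test additions in this patch follow the same recipe. KeywordTokenizer emits its entire input as a single token, so analyzing the empty string hands the filter under test exactly one zero-length term; checkOneTermReuse(a, "", "") then runs the analyzer twice and asserts a single empty term comes back each time, verifying both that the filter tolerates empty terms and that its components survive reuse. Schematically (a sketch, with the hypothetical XxxFilter standing in for whichever filter a given test targets):

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader); // "" -> one zero-length token
        return new TokenStreamComponents(tokenizer, new XxxFilter(tokenizer));
      }
    };
    checkOneTermReuse(a, "", ""); // analyze "" twice, expect the empty term back both times
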
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
index 085a9f8cea7..26803de878a 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java
@@ -23,13 +23,13 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -277,4 +277,15 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomHugeStrings() throws Exception {
     checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java
index 4a2efe32f4f..c3d54e381b5 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Tests for {@link CJKWidthFilter}
@@ -64,4 +65,15 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
   public void testRandomData() throws IOException {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new CJKWidthFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index eba1d1fd124..58dfe708625 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -361,4 +362,26 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws Exception {
+    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+
+    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
+    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+    Analyzer b = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    checkOneTermReuse(b, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
index 835a47e7471..eedfa6a2933 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
@@ -18,10 +18,14 @@ package org.apache.lucene.analysis.cz;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -282,4 +286,15 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
     assertTokenStreamContents(filter, new String[] { "hole", "desk" });
   }
 
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
+
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
index 3c3528d6a9f..c5a5fb991af 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GermanLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
index cc1a669e341..5bb55bfe76f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -61,4 +62,15 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GermanMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java
index 4bdeaacc890..93c7ca59824 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Tests {@link GermanNormalizationFilter}
@@ -65,4 +66,15 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GermanNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 27e9a846338..c9b3590728c 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.de;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 
@@ -61,4 +62,15 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GermanStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
index 8b0192e1555..8054c1fd4ae 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
@@ -17,8 +17,13 @@ package org.apache.lucene.analysis.el;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 public class TestGreekStemmer extends BaseTokenStreamTestCase {
   Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
@@ -522,4 +527,15 @@ public class TestGreekStemmer extends BaseTokenStreamTestCase {
     checkOneTerm(a, "αρχοντασ", "αρχοντ");
     checkOneTerm(a, "αρχοντων", "αρχοντ");
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GreekStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
index 43c269627e6..d40db25306c 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Simple tests for {@link EnglishMinimalStemFilter}
@@ -55,4 +56,15 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemmer.java
index 3449f81e85a..806952a3c1e 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemmer.java
@@ -19,12 +19,14 @@ package org.apache.lucene.analysis.en;
 
 import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
 
+import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Tests for {@link KStemmer}
@@ -51,6 +53,17 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
   public void testVocabulary() throws Exception {
     assertVocabulary(a, getDataFile("kstemTestData.zip"), "kstem_examples.txt");
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 
 /****** requires original java kstem source code to create map
 public void testCreateMap() throws Exception {
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
index 0aec8d6771c..5c3792072f6 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
@@ -22,6 +22,7 @@ import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.Analyzer;
@@ -64,4 +65,15 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PorterStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
index daaca467161..147f7bb9e02 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new SpanishLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
index 08193aecf97..8ca39335ba5 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
@@ -18,10 +18,14 @@ package org.apache.lucene.analysis.fa;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test the Persian Normalization Filter
@@ -60,5 +64,16 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
         tokenStream);
     assertTokenStreamContents(filter, new String[]{expected});
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PersianNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
index 5cd64550441..85b15beaab7 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new FinnishLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java
index 1223e014132..eb8d9d1f9d3 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java
@@ -18,13 +18,16 @@ package org.apache.lucene.analysis.fr;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -53,5 +56,16 @@ public class TestElision extends BaseTokenStreamTestCase {
     }
     return tas;
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ElisionFilter(TEST_VERSION_CURRENT, tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
index 57eb8adb782..75ec0765d12 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -163,4 +164,15 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new FrenchLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
index e6fb11fb618..3ea0813d166 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -63,4 +64,15 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new FrenchMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java
index 6725292c967..6392d858a82 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilter.java
@@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ga;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test the Irish lowercase filter.
@@ -38,4 +43,15 @@ public class TestIrishLowerCaseFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(filter, new String[] {"n-athair", "t-uisce", "hard",});
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new IrishLowerCaseFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
index f8be16e4edb..7dd2d5b7f36 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
@@ -17,12 +17,14 @@ package org.apache.lucene.analysis.gl;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Simple tests for {@link GalicianMinimalStemmer}
@@ -52,4 +54,15 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java
index c48b3412742..c87ccf9f12c 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
@@ -48,4 +49,15 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("gltestdata.zip"), "gl.txt");
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new GalicianStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
index 894872955e7..727607ca144 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiNormalizer.java
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hi;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test HindiNormalizer
@@ -63,4 +66,15 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
     TokenFilter tf = new HindiNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new HindiNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
index b48935bc73f..c812b6a3f1a 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hi;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test HindiStemmer
@@ -85,4 +88,15 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
     TokenFilter tf = new HindiStemFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new HindiStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
index 90e4768db31..a53a3ee3dc7 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -44,4 +45,15 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new HungarianLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
index a5cb1ef3b3a..70f2be08342 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.junit.BeforeClass;
@@ -73,4 +74,15 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
index 2d9f832dc23..e3b4e40a998 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
@@ -132,4 +132,15 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
     checkOneTermReuse(a, "bukukah", "buku");
     checkOneTermReuse(a, "gigi", "gigi");
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
index e26f6e82ed3..4705bc0a660 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.in;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test IndicNormalizer
@@ -48,4 +51,15 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
     TokenFilter tf = new IndicNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new IndicNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
index 90f96168dff..44ee495030a 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ItalianLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
index 68201d11b9e..8aadb2e8997 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Basic tests for {@link LatvianStemmer}
@@ -268,4 +269,15 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
     checkOneTerm(a, "usa", "usa");   // length
     checkOneTerm(a, "60ms", "60ms"); // vowel count
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
index 27e8d054ad7..322704056a0 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
@@ -22,8 +22,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.List;
@@ -1923,4 +1925,15 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
index 95f3e6a5efb..b6edc6aeb05 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
@@ -133,4 +134,15 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
 
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
index 54931a8d117..e1347803d23 100755
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.miscellaneous;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -25,6 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * HyphenatedWordsFilter test
@@ -74,4 +76,15 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
 
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
index 5cc5dbe37d1..f2abf52aac5 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
@@ -18,6 +18,10 @@ package org.apache.lucene.analysis.miscellaneous;
  */
 
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
 public class TestLengthFilter extends BaseTokenStreamTestCase {
@@ -41,5 +45,16 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
       new int[]{1, 4, 2}
     );
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
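In the LengthFilter test above the arguments matter: assuming the 3.x-era signature LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max), the minimum length of 0 is what lets the zero-length token through, so checkOneTermReuse(a, "", "") can still expect one empty term back.

    // Sketch of the distinction: min = 1 instead of 0 would drop the empty
    // token entirely, and the single-term expectation of
    // checkOneTermReuse(a, "", "") would no longer hold.
    new LengthFilter(true, tokenizer, 0, 5); // empty term passes the length check
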
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
index 2effc37abec..96df848a6d4 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -31,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util._TestUtil;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.util.Iterator;
 import java.util.Arrays;
@@ -164,5 +166,16 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
       checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
     }
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
index 5c53da0926c..0179b94e353 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
 
 /**
@@ -130,4 +131,15 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random.nextBoolean()));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 4f64510bc89..754116c4f60 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -18,7 +18,10 @@ package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -322,4 +325,26 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
     }
   }
+
+  public void testEmptyTerm() throws IOException {
+    for (int i = 0; i < 512; i++) {
+      final int flags = i;
+      final CharArraySet protectedWords;
+      if (random.nextBoolean()) {
+        protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
+      } else {
+        protectedWords = null;
+      }
+
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new KeywordTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+        }
+      };
+      // depending upon options, this thing may or may not preserve the empty term
+      checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+    }
+  }
 }
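The WordDelimiterFilter test differs from the rest of the patch: the loop bound 512 = 2^9 enumerates every combination of the filter's nine behavior flags, and as the inline comment notes, some combinations drop a zero-length token while others (for example, combinations that preserve the original token) keep it. Since there is no single expected output across all combinations, the test uses checkAnalysisConsistency(random, a, random.nextBoolean(), "") — which only asserts that analyzing "" is well-behaved and repeatable — rather than checkOneTermReuse with a fixed expectation.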
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
index e46fd529672..e8e7f6cf4ad 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 
@@ -152,4 +153,26 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer,
+            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
+      }
+    };
+    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+
+    Analyzer b = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer,
+            new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
+      }
+    };
+    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
index ec93076b27b..3375c027057 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 
@@ -132,4 +132,16 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer,
+            new NGramTokenFilter(tokenizer, 2, 15));
+      }
+    };
+    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+  }
 }
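The n-gram tests likewise use checkAnalysisConsistency: with minGram=2, a term shorter than two characters yields no grams at all, so an empty term may simply vanish from the stream, which is legal as long as the stream stays internally consistent. The effect is easy to see directly (a standalone sketch, not part of the patch):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;

public class EmptyTermNGramDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
    TokenStream stream = new NGramTokenFilter(tokenizer, 2, 15);
    stream.reset();
    int count = 0;
    while (stream.incrementToken()) {
      count++; // never reached: a zero-length term produces no 2..15-grams
    }
    stream.end();
    stream.close();
    System.out.println(count); // 0
  }
}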
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
index fdbb956bd64..8dcca3502e3 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -50,4 +51,15 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new NorwegianLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
index 855ab36f441..0c137a52bcd 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -50,4 +51,15 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
index 3efb23f1f01..b2f1ee1f93b 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
@@ -22,7 +22,9 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.regex.Pattern;
@@ -103,5 +105,16 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
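PatternReplaceFilter's empty-term behavior falls out of java.util.regex: the arguments above mean "replace every match of a with b", and a matcher over a zero-length input finds no match for that pattern. A two-line sketch of the underlying behavior:

import java.util.regex.Pattern;

// Pattern "a" cannot match inside "", so replaceAll leaves it empty.
String out = Pattern.compile("a").matcher("").replaceAll("b"); // -> ""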
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
index a5b6ec283da..943472fbf63 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
@@ -96,4 +97,15 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PortugueseLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
index 1e6afe843da..2e0fef348f7 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
@@ -70,4 +71,15 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PortugueseMinimalStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
index c71c8d6fbf0..4c55abbb906 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
@@ -70,4 +71,15 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
index 15cf33f136f..f7ee3ce60b2 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.reverse;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -25,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.util.Version;
 
 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@@ -111,4 +113,15 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
index 015a7726810..5e86fee53fa 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new RussianLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
index 56e369ed126..1dadc3d54e5 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
 
@@ -1156,4 +1157,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
index 6b5ae1844fe..36bc26233a1 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
@@ -17,10 +17,15 @@ package org.apache.lucene.analysis.snowball;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -136,4 +141,23 @@ public class TestSnowball extends BaseTokenStreamTestCase {
       return true;
     }
   }
+
+  public void testEmptyTerm() throws IOException {
+    String langs[] = {
+      "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
+      "Finnish", "French", "German2", "German", "Hungarian", "Irish",
+      "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
+      "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
+    };
+    for (final String lang : langs) {
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new KeywordTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
+        }
+      };
+      checkOneTermReuse(a, "", "");
+    }
+  }
 }
\ No newline at end of file
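Each name in langs selects a generated Snowball stemmer class; SnowballFilter(TokenStream, String) resolves the name reflectively against the org.tartarus.snowball.ext package, which is why entries like "Kp" and "Lovins" (algorithm names, not languages) are valid. Roughly, as a sketch of that lookup with exception handling omitted:

import org.tartarus.snowball.SnowballProgram;

// Approximately what new SnowballFilter(input, "English") does internally.
static SnowballProgram loadStemmer(String name) throws Exception {
  return Class.forName("org.tartarus.snowball.ext." + name + "Stemmer")
      .asSubclass(SnowballProgram.class)
      .newInstance();
}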
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
index ae30b9be3cd..fb5d604324f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -49,4 +50,15 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new SwedishLightStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
index 01638c6137e..ba1f34dfeb2 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.synonym;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
@@ -32,6 +33,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util._TestUtil;
@@ -428,6 +430,29 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
+  public void testEmptyTerm() throws IOException {
+    final int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      b = new SynonymMap.Builder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+      }
+      final SynonymMap map = b.build();
+      final boolean ignoreCase = random.nextBoolean();
+
+      final Analyzer analyzer = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new KeywordTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
+        }
+      };
+
+      checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
+    }
+  }
+
   /** simple random test like testRandom2, but for large docs */
   public void testRandomHuge() throws Exception {
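Here the map itself is randomized: each iteration builds a fresh SynonymMap.Builder (the test's b field) with random dedup behavior, fills it through the test's add() helper with random non-empty entries, and wires a SynonymFilter over the usual KeywordTokenizer. Building a map by hand follows the same API; a sketch, where the dns mapping is an invented example:

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

SynonymMap.Builder builder = new SynonymMap.Builder(true); // true = dedup entries
builder.add(new CharsRef("dns"),
    SynonymMap.Builder.join(new String[] {"domain", "name", "system"}, new CharsRef()),
    true); // true = keep the original token alongside the synonym
SynonymMap map = builder.build(); // throws IOException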
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
index 5d359c65dc7..b50d74ae01d 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
@@ -17,10 +17,15 @@ package org.apache.lucene.analysis.th;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -183,4 +188,15 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
     ts.addAttribute(FlagsAttribute.class);
     assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ThaiWordFilter(TEST_VERSION_CURRENT, tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
index 7e4fb394961..4d6ab9390f1 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
@@ -17,11 +17,16 @@ package org.apache.lucene.analysis.tr;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Test the Turkish lowercase filter.
@@ -62,4 +67,15 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
         "\u0131\u0316sparta",});
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new TurkishLowerCaseFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java
index 3cb16cbab2c..4b134427dfb 100644
--- a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java
+++ b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 /**
  * Tests ICUFoldingFilter
@@ -77,4 +78,15 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java
index 267920debcd..05a5cd7724a 100644
--- a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java
+++ b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2Filter.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import com.ibm.icu.text.Normalizer2;
 
@@ -77,4 +78,15 @@ public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
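The two ICU filters above are thin wrappers over ICU4J normalizers: ICUNormalizer2Filter defaults to NFKC normalization plus case folding, and ICUFoldingFilter layers further foldings on top. The ICU4J call underneath is roughly the following (a standalone sketch):

import com.ibm.icu.text.Normalizer2;

// nfkc_cf = NFKC normalization + case folding, ICUNormalizer2Filter's default.
Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
String folded = n2.normalize("Ⅻ"); // "xii": compatibility-decomposed, then case-folded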
diff --git a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
index c39a1a13fd4..68ddf94b7ca 100644
--- a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
+++ b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
@@ -100,4 +100,15 @@ public class TestICUTransformFilter extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
index ca0d4548d29..91d194209ab 100644
--- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
+++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
@@ -23,6 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
   private Analyzer analyzer = new Analyzer() {
@@ -47,4 +48,15 @@ public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws IOException {
     checkRandomData(random, analyzer, atLeast(10000));
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
index 33ab248d7c6..c61542cb1fa 100644
--- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
+++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -68,4 +69,15 @@ public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
   public void testRandomData() throws IOException {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new KuromojiKatakanaStemFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java
index 1f237bb681c..32b04323676 100644
--- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java
+++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.kuromoji;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -61,4 +62,15 @@ public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
     checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
     checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
index 181e4f56f06..79aea27abb8 100644
--- a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
+++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.lucene.analysis.phonetic;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -24,7 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.util._TestUtil;
 
@@ -95,4 +96,15 @@ public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
     };
     checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, 8, random.nextBoolean()));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
index 9f9ef9dda56..60a3b1a7cad 100644
--- a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
+++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.phonetic;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 import java.util.HashSet;
 
@@ -28,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.junit.Ignore;
 
 /** Tests {@link BeiderMorseFilter} */
@@ -91,4 +93,15 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
   public void testRandom() throws Exception {
     checkRandomData(random, analyzer, 1000 * RANDOM_MULTIPLIER);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)));
+      }
+    };
+    checkOneTermReuse(a, "", "");
+  }
 }
diff --git a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
index 674699f7edd..fd56adab6d2 100644
--- a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
+++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
@@ -103,4 +104,20 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase {
       checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
     }
   }
+
+  public void testEmptyTerm() throws IOException {
+    Encoder encoders[] = new Encoder[] {
+      new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
+    };
+    for (final Encoder e : encoders) {
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new KeywordTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, random.nextBoolean()));
+        }
+      };
+      checkOneTermReuse(a, "", "");
+    }
+  }
 }
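All five encoders in the array come from Apache Commons Codec; each maps a term to a phonetic key so that differently spelled, similar-sounding terms collide. For example (a standalone sketch):

import org.apache.commons.codec.language.Soundex;

Soundex soundex = new Soundex();
String a = soundex.encode("Smith"); // "S530"
String b = soundex.encode("Smyth"); // "S530" -- same key, which is the point of phonetic search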
diff --git a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
index 77489f440c6..e232a7bdf48 100644
--- a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.cn.smart;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -26,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
 import org.apache.lucene.util.Version;
 
@@ -228,4 +230,15 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomHugeStrings() throws Exception {
     checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
   }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new WordTokenFilter(tokenizer));
+      }
+    };
+    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+  }
 }
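Taken together, these tests pin down one contract: a TokenFilter must tolerate a zero-length term, whether it passes it through, rewrites it, or drops it (WordTokenFilter's word segmentation may drop it, hence checkAnalysisConsistency here as well). Stated as code, a defensive filter looks roughly like this; HypotheticalStemFilter and its stem step are invented for illustration:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class HypotheticalStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  HypotheticalStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (termAtt.length() > 0) { // guard: never hand an empty buffer to the stemmer
      // termAtt.setLength(stem(termAtt.buffer(), termAtt.length())); // invented stem step
    }
    return true;
  }
}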