diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
new file mode 100644
index 00000000000..a859897037b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
+  Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(t, new CJKBigramFilter(t));
+    }
+  };
+
+  public void testHuge() throws Exception {
+    assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+        new String[] {
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
+        }
+    );
+  }
+
+  public void testHanOnly() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
+      }
+    };
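+    // only adjacent Han characters are paired into bigrams; Hiragana falls through as unigrams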
+    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
index 73110d6ffe2..199583e9b04 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,20 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
+      }
+    };
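+    // KeywordMarkerFilter sets KeywordAttribute for "sängerinnen", so the stemmer must leave it unchanged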
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
index b1819d49900..ed45d99a32b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "äpfel", "apfel");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 34e7d33cbb0..5636f8949d1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -23,9 +23,13 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -58,6 +62,19 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(analyzer, "", new String[] { "" });
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
index 623a907df15..3f2fa093ef2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
index d691e937a73..f280f0ae1da 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -176,6 +179,19 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "chevaux", "chevaux");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
index 3afcf236877..4772862b044 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -55,6 +58,19 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "baron", "baron");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "chevaux", "chevaux");
+  }
+
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
index cc8dfe7a903..a81fe554ff0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 /**
  * Simple tests for {@link GalicianMinimalStemmer}
  */
@@ -50,6 +53,19 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(a, "barcelonês", "barcelonês");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("elefantes"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "elefantes", "elefantes");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
index 25654aa6d71..92f60647a09 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "babakocsi", "babakocsi");
+  }
+
   public void testEmptyTerm() throws IOException {
     Analyzer a = new Analyzer() {
       @Override
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
index 6ab3ba19230..d734850c335 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
@@ -21,6 +21,7 @@ import java.io.IOException;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;

@@ -147,6 +148,13 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
     checkOneTerm(a, "fiets", "fiets");
   }

+  public void testEmptyStemDictionary() throws IOException {
+    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
+        CharArraySet.EMPTY_SET, CharArrayMap.<String>emptyMap());
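+    // with no stem-override entry protecting "fiets", the stemmer reduces it to "fiet"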
+    checkOneTerm(a, "fiets", "fiet");
+  }
+
   /**
    * Test that stopwords are not case sensitive
    */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
index 075ebad8330..36e92154fd0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
@@ -25,8 +25,11 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -47,6 +50,19 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sekretæren", "sekretæren");
+  }

   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
index c45eb153218..e19f656f2f5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
@@ -25,8 +25,11 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -47,6 +50,19 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sekretæren", "sekretæren");
+  }

   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
index f7111666b00..9f89a25fe65 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
@@ -22,11 +22,15 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -93,6 +97,19 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
index f0f7c5ba4c4..b7928fb945d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
@@ -22,11 +22,15 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -67,6 +71,19 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
index c919701ae79..5ede24a44a2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
@@ -24,11 +24,14 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;

 /**
  * Simple tests for {@link PortugueseStemFilter}
  */
@@ -67,6 +70,19 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
index dc992fbff8c..c5ad5744963 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("энергии"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "энергии", "энергии");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
index b2966b23c86..484c88f9d64 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlens"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "jaktkarlens", "jaktkarlens");
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
index 279801be4e6..ed8ad8aaa0a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
@@ -68,6 +68,14 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
         "\u0131\u0316sparta",});
   }

+  public void testDecomposed3() throws Exception {
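+    // U+0049 (I) followed by U+0307 (combining dot above) must lowercase to a plain "i"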
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "\u0049\u0307"), MockTokenizer.WHITESPACE, false);
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+    assertTokenStreamContents(filter, new String[] {"i"});
+  }
+
   public void testEmptyTerm() throws IOException {
     Analyzer a = new Analyzer() {
       @Override
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index fa4f553c22a..c80ffab8dad 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -20,8 +20,12 @@ package org.apache.lucene.analysis.icu.segmentation;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
+
+import com.ibm.icu.lang.UScript;

 import java.io.IOException;
 import java.io.Reader;
@@ -243,4 +247,19 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
     Random random = random();
     checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
   }
+
+  public void testTokenAttributes() throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader("This is a test"));
+    ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
+    ts.reset();
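+    // every token of this Latin-script input should report the LATIN script code and names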
+    while (ts.incrementToken()) {
+      assertEquals(UScript.LATIN, scriptAtt.getCode());
+      assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
+      assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
+      assertTrue(ts.reflectAsString(false).contains("script=Latin"));
+    }
+    ts.end();
+    ts.close();
+  }
 }
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
index ec4db98f945..84f4d8d62c0 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
@@ -22,8 +22,11 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
   private Analyzer analyzer = new Analyzer() {
@@ -40,6 +43,22 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
     );
   }

+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("あり"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
+      }
+    };
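+    // keyword-marked "あり" keeps its inflected form instead of the base form "ある"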
+    assertAnalyzesTo(a, "それはまだ実験段階にあります",
+        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+    );
+  }
+
   public void testEnglish() throws IOException {
     assertAnalyzesTo(analyzer, "this atest",
         new String[] { "this", "atest" });
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
index 84c2f5c9790..45a2e41b699 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.ja;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

 import java.io.IOException;
 import java.io.Reader;
@@ -60,6 +63,20 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
       new int[] { 0, 4, 9, 14, 20, 25 },
       new int[] { 3, 8, 13, 19, 24, 29 });
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("コーヒー"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
+      }
+    };
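+    // keyword-marked "コーヒー" keeps its trailing prolonged sound mark instead of being stemmed to "コーヒ"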
+    checkOneTerm(a, "コーヒー", "コーヒー");
+  }

   public void testUnsupportedHalfWidthVariants() throws IOException {
     // The below result is expected since only full-width katakana is supported