test some untested analysis corner cases

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361896 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-07-16 03:34:46 +00:00
parent dc27d24e0b
commit 7002e7cb09
21 changed files with 376 additions and 0 deletions


@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t));
}
};
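// testHuge repeats the same sentence so the concatenated input is far longer than
// one buffer's worth of bigrams; this presumably exercises the filter's internal
// buffering/refill path across token boundaries.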
public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
new String[] {
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
}
);
}
public void testHanOnly() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
}
}
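
CJKBigramFilter selects which scripts get bigrammed via bit flags, which is the corner case testHanOnly covers: with CJKBigramFilter.HAN, only Han characters are paired, and the Hiragana characters pass through as unigrams. A minimal sketch of combining flags (the flag constants are CJKBigramFilter's own; the analyzer boilerplate mirrors the tests above):

    // Sketch: bigram Han and Hiragana together, leave other scripts as unigrams.
    Analyzer hanAndHiragana = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(t,
            new CJKBigramFilter(t, CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA));
      }
    };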


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
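
All of the new testKeyword methods follow one pattern: KeywordMarkerFilter sets KeywordAttribute on any token found in the exclusion set, and each stemmer checks that attribute before rewriting a term. A simplified sketch of the consuming side (a hypothetical filter for illustration, not one of the stemmers above):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

    public final class SketchStemFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

      public SketchStemFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        if (!keywordAtt.isKeyword()) {
          // stem termAtt's buffer in place here; keyword-marked terms are skipped
        }
        return true;
      }
    }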


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "äpfel", "apfel");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** Test against a vocabulary from the reference impl */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");


@@ -23,9 +23,13 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -58,6 +62,19 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "", new String[] { "" });
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanStemFilter(sink));
}
};
checkOneTerm(a, "sängerinnen", "sängerinnen");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
}
};
checkOneTerm(a, "edeltäjistään", "edeltäjistään");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -176,6 +179,19 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
}
};
checkOneTerm(a, "chevaux", "chevaux");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -55,6 +58,19 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "baron", "baron");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
}
};
checkOneTerm(a, "chevaux", "chevaux");
}
/** Test against a vocabulary from the reference impl */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Simple tests for {@link GalicianMinimalStemmer}
@@ -50,6 +53,19 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(a, "barcelonês", "barcelonês");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("elefantes"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
}
};
checkOneTerm(a, "elefantes", "elefantes");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
}
};
checkOneTerm(a, "babakocsi", "babakocsi");
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override


@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@@ -147,6 +148,12 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
checkOneTerm(a, "fiets", "fiets");
}
public void testEmptyStemDictionary() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
CharArraySet.EMPTY_SET, CharArrayMap.<String>emptyMap());
checkOneTerm(a, "fiets", "fiet");
}
/**
* Test that stopwords are not case sensitive
*/
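
testEmptyStemDictionary works because DutchAnalyzer ships with a default stem-override dictionary that maps "fiets" to itself; the preceding test therefore sees "fiets" unchanged, while an empty CharArrayMap lets the snowball stemmer reduce it to "fiet". A sketch of supplying that protection explicitly (same constructor as in the test; the map entry is a hypothetical example):

    // Hypothetical: a one-entry stem-override map instead of the default dictionary.
    CharArrayMap<String> stemDict = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false);
    stemDict.put("fiets", "fiets"); // keep "fiets" away from the snowball stemmer
    DutchAnalyzer custom = new DutchAnalyzer(TEST_VERSION_CURRENT,
        CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET, stemDict);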


@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -48,6 +51,19 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
}
};
checkOneTerm(a, "sekretæren", "sekretæren");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Random random = random();


@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -48,6 +51,19 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
}
};
checkOneTerm(a, "sekretæren", "sekretæren");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Random random = random();


@@ -22,11 +22,15 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -93,6 +97,19 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
}
};
checkOneTerm(a, "quilométricas", "quilométricas");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -22,11 +22,15 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -67,6 +71,19 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
}
};
checkOneTerm(a, "quilométricas", "quilométricas");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -24,11 +24,14 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Simple tests for {@link PortugueseStemFilter}
@@ -67,6 +70,19 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
}
};
checkOneTerm(a, "quilométricas", "quilométricas");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("энергии"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
}
};
checkOneTerm(a, "энергии", "энергии");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlens"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
}
};
checkOneTerm(a, "jaktkarlens", "jaktkarlens");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);


@@ -68,6 +68,13 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
"\u0131\u0316sparta",});
}
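// Corner case: "\u0049\u0307" is capital I followed by COMBINING DOT ABOVE (U+0307),
// the decomposed form of 'İ'; the filter should fold the pair into a single plain "i".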
public void testDecomposed3() throws Exception {
TokenStream stream = new MockTokenizer(new StringReader(
"\u0049\u0307"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"i"});
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override


@@ -20,8 +20,12 @@ package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import com.ibm.icu.lang.UScript;
import java.io.IOException;
import java.io.Reader;
@@ -243,4 +247,18 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
Random random = random();
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
public void testTokenAttributes() throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader("This is a test"));
ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
ts.reset();
while (ts.incrementToken()) {
assertEquals(UScript.LATIN, scriptAtt.getCode());
assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
assertTrue(ts.reflectAsString(false).contains("script=Latin"));
}
ts.end();
ts.close();
}
}
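
testTokenAttributes covers the per-token ScriptAttribute that the ICU tokenizer populates: for plain ASCII input every token should report UScript.LATIN, the attribute's getName()/getShortName() should agree with ICU's UScript tables, and the value should also surface in reflectAsString() output as "script=Latin".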


@@ -22,8 +22,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@@ -40,6 +43,21 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
);
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("あり"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
}
};
assertAnalyzesTo(a, "それはまだ実験段階にあります",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます" }
);
}
public void testEnglish() throws IOException {
assertAnalyzesTo(analyzer, "this atest",
new String[] { "this", "atest" });


@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.io.Reader;
@@ -61,6 +64,19 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
new int[] { 3, 8, 13, 19, 24, 29 });
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("コーヒー"), false);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
}
};
checkOneTerm(a, "コーヒー", "コーヒー");
}
public void testUnsupportedHalfWidthVariants() throws IOException {
// The below result is expected since only full-width katakana is supported
assertAnalyzesTo(analyzer, "ﾀｸｼｰ", new String[] { "ﾀｸｼｰ" });
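
JapaneseKatakanaStemFilter's heuristic strips a trailing prolonged-sound mark (ー) from sufficiently long full-width katakana terms, so コーヒー would normally stem to コーヒ; the keyword marker in testKeyword protects the unstemmed form, and the half-width ﾀｸｼｰ above is left alone because only full-width katakana is supported.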