test some untested analysis corner cases
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361896 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent dc27d24e0b
commit 7002e7cb09
@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t));
    }
  };

  public void testHuge() throws Exception {
    assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
        new String[] {
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
          "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
        }
    );
  }

  public void testHanOnly() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
      }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
  }
}
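For context: CJKBigramFilter forms overlapping bigrams from adjacent CJK characters emitted by the tokenizer; the flagged constructor restricts bigramming to the given scripts and passes other CJK characters through as unigrams, which is why testHanOnly sees the hiragana as single characters. A minimal sketch of consuming the filter's output directly, assuming the 4.x-era API used above (the Version constant and field handling here are illustrative, not part of the commit):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CJKBigramDemo {
  public static void main(String[] args) throws Exception {
    // tokenize, then bigram: 多くの学生 -> 多く, くの, の学, 学生
    Tokenizer t = new StandardTokenizer(Version.LUCENE_40, new StringReader("多くの学生"));
    TokenStream ts = new CJKBigramFilter(t);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // required before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}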
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "sängerinnen", "sängerinnen");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
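The testKeyword pattern above repeats across the stemmer tests in the hunks that follow, and it relies on one contract: KeywordMarkerFilter sets KeywordAttribute for terms in the exclusion set, and each keyword-aware stemming filter checks that attribute before touching the term buffer. A sketch of the consuming side (SketchStemFilter is hypothetical, shown only to illustrate the contract):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

final class SketchStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  SketchStemFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAtt.isKeyword()) {
      // a real stemmer rewrites termAtt's buffer here; marked keywords pass through
      // unchanged, which is what checkOneTerm(a, "sängerinnen", "sängerinnen") verifies
    }
    return true;
  }
}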
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
    checkOneTerm(analyzer, "äpfel", "apfel");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
      }
    };
    checkOneTerm(a, "sängerinnen", "sängerinnen");
  }

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
@@ -23,9 +23,13 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -58,6 +62,19 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
    assertAnalyzesTo(analyzer, "", new String[] { "" });
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanStemFilter(sink));
      }
    };
    checkOneTerm(a, "sängerinnen", "sängerinnen");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -176,6 +179,19 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "chevaux", "chevaux");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -55,6 +58,19 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
    checkOneTerm(analyzer, "baron", "baron");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
      }
    };
    checkOneTerm(a, "chevaux", "chevaux");
  }

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Simple tests for {@link GalicianMinimalStemmer}
@@ -50,6 +53,19 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
    checkOneTerm(a, "barcelonês", "barcelonês");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("elefantes"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
      }
    };
    checkOneTerm(a, "elefantes", "elefantes");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "babakocsi", "babakocsi");
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
@@ -21,6 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

@@ -147,6 +148,12 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "fiets", "fiets");
  }

  public void testEmptyStemDictionary() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
        CharArraySet.EMPTY_SET, CharArrayMap.<String>emptyMap());
    checkOneTerm(a, "fiets", "fiet");
  }

  /**
   * Test that stopwords are not case sensitive
   */
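testEmptyStemDictionary exercises DutchAnalyzer's stem override table: with the table empty, the stemmer reduces "fiets" to "fiet", whereas the default table keeps it intact. A user-supplied table should behave the same way; a sketch, assuming the CharArrayMap constructor and put signatures of this era:

// pin "fiets" via a one-entry stem override map (sketch, not from the commit)
CharArrayMap<String> stemOverrides = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false);
stemOverrides.put("fiets", "fiets");
DutchAnalyzer custom = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
    CharArraySet.EMPTY_SET, stemOverrides);
// checkOneTerm(custom, "fiets", "fiets") would now pass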
@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -47,6 +50,19 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "sekretæren", "sekretæren");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -47,6 +50,19 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
      }
    };
    checkOneTerm(a, "sekretæren", "sekretæren");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
@@ -22,11 +22,15 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -93,6 +97,19 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "quilométricas", "quilométricas");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -22,11 +22,15 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -67,6 +71,19 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
      }
    };
    checkOneTerm(a, "quilométricas", "quilométricas");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -24,11 +24,14 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Simple tests for {@link PortugueseStemFilter}
@@ -67,6 +70,19 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
      }
    };
    checkOneTerm(a, "quilométricas", "quilométricas");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("энергии"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "энергии", "энергии");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -46,6 +49,19 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
    assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlens"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
      }
    };
    checkOneTerm(a, "jaktkarlens", "jaktkarlens");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -68,6 +68,13 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
        "\u0131\u0316sparta",});
  }

  public void testDecomposed3() throws Exception {
    TokenStream stream = new MockTokenizer(new StringReader(
        "\u0049\u0307"), MockTokenizer.WHITESPACE, false);
    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
    assertTokenStreamContents(filter, new String[] {"i"});
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
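testDecomposed3 covers the decomposed form of the Turkish dotted capital İ: U+0049 (I) followed by U+0307 (COMBINING DOT ABOVE) composes to U+0130, whose Turkish-locale lowercase is a plain "i". The JDK demonstrates the same Unicode mapping (a standalone check for illustration, not part of the commit):

import java.text.Normalizer;
import java.util.Locale;

public class TurkishCaseDemo {
  public static void main(String[] args) {
    String decomposed = "\u0049\u0307"; // 'I' + COMBINING DOT ABOVE
    // NFC composes the pair into İ (U+0130)
    System.out.println(Normalizer.normalize(decomposed, Normalizer.Form.NFC).equals("\u0130")); // true
    // Turkish-locale lowercasing drops the dot, matching the filter's expected "i"
    System.out.println("\u0130".toLowerCase(new Locale("tr"))); // i
  }
}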
@@ -20,8 +20,12 @@ package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;

import com.ibm.icu.lang.UScript;

import java.io.IOException;
import java.io.Reader;
@@ -243,4 +247,18 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    Random random = random();
    checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
  }

  public void testTokenAttributes() throws Exception {
    TokenStream ts = a.tokenStream("dummy", new StringReader("This is a test"));
    ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      assertEquals(UScript.LATIN, scriptAtt.getCode());
      assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
      assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
      assertTrue(ts.reflectAsString(false).contains("script=Latin"));
    }
    ts.end();
    ts.close();
  }
}
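Besides the ScriptAttribute assertions, testTokenAttributes shows the mandatory TokenStream consumer sequence: reset(), incrementToken() until exhausted, end(), close(). Factored into a generic helper it might look like this (a sketch, not a Lucene API):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class TokenStreamHelper {
  static List<String> collectTerms(TokenStream ts) throws IOException {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    List<String> terms = new ArrayList<String>();
    ts.reset();                      // must precede the first incrementToken()
    while (ts.incrementToken()) {
      terms.add(term.toString());
    }
    ts.end();                        // lets the stream record end-of-stream state
    ts.close();                      // releases the underlying Reader
    return terms;
  }
}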
@@ -22,8 +22,11 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new Analyzer() {
@@ -40,6 +43,21 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
    );
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("あり"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
      }
    };
    assertAnalyzesTo(a, "それはまだ実験段階にあります",
        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
    );
  }

  public void testEnglish() throws IOException {
    assertAnalyzesTo(analyzer, "this atest",
        new String[] { "this", "atest" });
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import java.io.IOException;
import java.io.Reader;
@@ -60,6 +63,19 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
        new int[] { 0, 4, 9, 14, 20, 25 },
        new int[] { 3, 8, 13, 19, 24, 29 });
  }

  public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("コーヒー"), false);
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
      }
    };
    checkOneTerm(a, "コーヒー", "コーヒー");
  }

  public void testUnsupportedHalfWidthVariants() throws IOException {
    // The below result is expected since only full-width katakana is supported
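Without the keyword marker, JapaneseKatakanaStemFilter strips a trailing prolonged sound mark (ー, U+30FC) from katakana terms at or above its minimum length, so an unprotected コーヒー should come out as コーヒ. A sketch of the unprotected counterpart of testKeyword, under that assumption about the default minimum length:

// no KeywordMarkerFilter in front of the stemmer this time (sketch, not from the commit)
Tokenizer source = new MockTokenizer(new StringReader("コーヒー"), MockTokenizer.WHITESPACE, false);
TokenStream stemmed = new JapaneseKatakanaStemFilter(source);
assertTokenStreamContents(stemmed, new String[] { "コーヒ" });  // trailing ー removed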