mirror of https://github.com/apache/lucene.git
test some untested analysis corner cases
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361896 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent dc27d24e0b
commit 7002e7cb09
@@ -0,0 +1,67 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
+  Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(t, new CJKBigramFilter(t));
+    }
+  };
+
+  public void testHuge() throws Exception {
+    assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+        + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+      new String[] {
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+        "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
+      }
+    );
+  }
+
+  public void testHanOnly() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
+      }
+    };
+    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+  }
+}
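For context on the new test file: CJKBigramFilter emits overlapping character bigrams from runs of CJK text, which is why testHuge sees the eleven concatenated sentences bigrammed straight across the sentence joins ("た" + "多" becomes "た多"), and why the HAN-only variant in testHanOnly leaves the hiragana as unigrams while still pairing the Han runs into "学生" and "試験". A minimal standalone sketch of that behavior, not part of this commit; Version.LUCENE_40 stands in for the trunk's TEST_VERSION_CURRENT and is an assumption:

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CJKBigramDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new StandardTokenizer(Version.LUCENE_40, reader);
        // Pass CJKBigramFilter.HAN as a second argument to bigram Han characters only.
        return new TokenStreamComponents(t, new CJKBigramFilter(t));
      }
    };
    // "日本語" yields the overlapping bigrams "日本" and "本語".
    TokenStream ts = analyzer.tokenStream("field", new StringReader("日本語"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}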
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,6 +49,19 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
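The testKeyword addition above is the corner case this commit repeats across nearly every stemmer test that follows (German, Finnish, French, Galician, Hungarian, Norwegian, Portuguese, Russian, Swedish, and the Japanese filters): a KeywordMarkerFilter is chained in front of the stem filter, marks every term found in the exclusion set with KeywordAttribute, and the stemmer passes marked tokens through unchanged, so "sängerinnen" survives verbatim. A self-contained sketch of the pattern under the same 4.x-era APIs used in the diff (MockTokenizer lives in Lucene's test framework):

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.de.GermanLightStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

public class KeywordExclusionSketch {
  /** Builds an analyzer whose stemmer skips every term in {@code exclusions}. */
  public static Analyzer build(final CharArraySet exclusions) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // Mark excluded terms first; GermanLightStemFilter consults
        // KeywordAttribute and leaves marked tokens unstemmed.
        TokenStream sink = new KeywordMarkerFilter(source, exclusions);
        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
      }
    };
  }
}

The ordering is the design point: the marker must sit before the stemmer in the chain, since the stemmer only checks an attribute that has already been set.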
@@ -23,8 +23,11 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "äpfel", "apfel");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
 
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
@@ -23,9 +23,13 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -58,6 +62,19 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(analyzer, "", new String[] { "" });
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GermanStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sängerinnen", "sängerinnen");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,6 +49,19 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -176,6 +179,19 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "chevaux", "chevaux");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -55,6 +58,19 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(analyzer, "baron", "baron");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "chevaux", "chevaux");
+  }
 
   /** Test against a vocabulary from the reference impl */
   public void testVocabulary() throws IOException {
     assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 /**
  * Simple tests for {@link GalicianMinimalStemmer}
@@ -50,6 +53,19 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
     checkOneTerm(a, "barcelonês", "barcelonês");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("elefantes"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "elefantes", "elefantes");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,6 +49,19 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "babakocsi", "babakocsi");
+  }
 
   public void testEmptyTerm() throws IOException {
     Analyzer a = new Analyzer() {
       @Override
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
 
@@ -147,6 +148,12 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
     checkOneTerm(a, "fiets", "fiets");
   }
+
+  public void testEmptyStemDictionary() throws IOException {
+    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
+        CharArraySet.EMPTY_SET, CharArrayMap.<String>emptyMap());
+    checkOneTerm(a, "fiets", "fiet");
+  }
 
   /**
    * Test that stopwords are not case sensitive
   */
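testEmptyStemDictionary exercises the four-argument DutchAnalyzer constructor shown above: (match version, stopwords, stem-exclusion set, stem-override dictionary). The analyzer's built-in override table maps "fiets" to itself, which is why the preceding test expects "fiets" unchanged; passing an empty CharArrayMap removes that override, so the stemmer actually runs and produces "fiet". A hedged sketch of the difference (Version.LUCENE_40 and the contents of the built-in table are assumptions, not stated in this diff):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DutchStemDictDemo {
  static String firstToken(Analyzer a, String text) throws IOException {
    TokenStream ts = a.tokenStream("f", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    ts.incrementToken();
    String result = term.toString();
    ts.end();
    ts.close();
    return result;
  }

  public static void main(String[] args) throws IOException {
    // Default analyzer: the built-in stem-override table keeps "fiets" intact.
    System.out.println(firstToken(new DutchAnalyzer(Version.LUCENE_40), "fiets"));  // fiets
    // Empty override table: the stemmer runs unmodified, reducing the term.
    System.out.println(firstToken(new DutchAnalyzer(Version.LUCENE_40,
        CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET,
        CharArrayMap.<String>emptyMap()), "fiets"));                                // fiet
  }
}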
@@ -25,8 +25,11 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -48,6 +51,19 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sekretæren", "sekretæren");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Random random = random();
@@ -25,8 +25,11 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -48,6 +51,19 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "sekretæren", "sekretæren");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Random random = random();
@@ -22,11 +22,15 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -93,6 +97,19 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -22,11 +22,15 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -67,6 +71,19 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -24,11 +24,14 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 /**
  * Simple tests for {@link PortugueseStemFilter}
@@ -67,6 +70,19 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "quilométricas", "quilométricas");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,6 +49,19 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("энергии"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "энергии", "энергии");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -23,8 +23,11 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
 
@@ -46,6 +49,19 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
     assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlens"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "jaktkarlens", "jaktkarlens");
+  }
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -68,6 +68,13 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
       "\u0131\u0316sparta",});
   }
+
+  public void testDecomposed3() throws Exception {
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "\u0049\u0307"), MockTokenizer.WHITESPACE, false);
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+    assertTokenStreamContents(filter, new String[] {"i"});
+  }
 
   public void testEmptyTerm() throws IOException {
     Analyzer a = new Analyzer() {
       @Override
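testDecomposed3 covers the decomposed spelling of the dotted capital İ: U+0049 (LATIN CAPITAL LETTER I) followed by U+0307 (COMBINING DOT ABOVE). Turkish-aware lowercasing has to recognize the pair as a single dotted I and fold it to plain "i", consuming the combining mark, rather than producing the dotless "ı" plus a stray dot. A sketch running the filter outside the test framework; the behavior shown is exactly what the test above asserts:

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;

public class TurkishDecomposedDemo {
  public static void main(String[] args) throws Exception {
    // "\u0049\u0307" is the decomposed form of "İ" (dotted capital I).
    TokenStream ts = new TurkishLowerCaseFilter(new MockTokenizer(
        new StringReader("\u0049\u0307"), MockTokenizer.WHITESPACE, false));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());  // prints: i
    }
    ts.end();
    ts.close();
  }
}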
@@ -20,8 +20,12 @@ package org.apache.lucene.analysis.icu.segmentation;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
+
+import com.ibm.icu.lang.UScript;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -243,4 +247,18 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
     Random random = random();
     checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
   }
+
+  public void testTokenAttributes() throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader("This is a test"));
+    ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
+    ts.reset();
+    while (ts.incrementToken()) {
+      assertEquals(UScript.LATIN, scriptAtt.getCode());
+      assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
+      assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
+      assertTrue(ts.reflectAsString(false).contains("script=Latin"));
+    }
+    ts.end();
+    ts.close();
+  }
 }
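testTokenAttributes relies on ICUTokenizer attaching a ScriptAttribute to every token, carrying the UScript code of the writing system the token came from plus human-readable long and short names; the reflectAsString check confirms the attribute participates in attribute reflection. A sketch of consuming the attribute directly (assumed: the icu module's ICUTokenizer(Reader) constructor with its default segmentation config):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ScriptAttributeDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new ICUTokenizer(new StringReader("Testing значение"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ScriptAttribute script = ts.addAttribute(ScriptAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // e.g. "Testing -> Latin", "значение -> Cyrillic"
      System.out.println(term + " -> " + script.getName());
    }
    ts.end();
    ts.close();
  }
}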
@@ -22,8 +22,11 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
   private Analyzer analyzer = new Analyzer() {
@@ -40,6 +43,21 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
     );
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("あり"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
+      }
+    };
+    assertAnalyzesTo(a, "それはまだ実験段階にあります",
+      new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+    );
+  }
 
   public void testEnglish() throws IOException {
     assertAnalyzesTo(analyzer, "this atest",
       new String[] { "this", "atest" });
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.ja;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -61,6 +64,19 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
       new int[] { 3, 8, 13, 19, 24, 29 });
   }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("コーヒー"), false);
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+        return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
+      }
+    };
+    checkOneTerm(a, "コーヒー", "コーヒー");
+  }
 
   public void testUnsupportedHalfWidthVariants() throws IOException {
     // The below result is expected since only full-width katakana is supported
     assertAnalyzesTo(analyzer, "タクシー", new String[] { "タクシー" });