mirror of https://github.com/apache/lucene.git
LUCENE-5642: Randomize attributeFactory in tests, use MockTokenizer more where possible, reduce use of esoteric Token ctors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1592339 13f79535-47bb-0310-9956-ffa450edef68
parent d67884a07c
commit 94c513ad5b
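The hunks below repeat three mechanical changes across the analysis test suite: concrete tokenizers used only as test fixtures are replaced with MockTokenizer, tokenizer constructors are passed a randomized AttributeFactory via newAttributeFactory(), and multi-argument Token constructors give way to the plain constructor plus setters. A rough before/after sketch of those patterns follows; it assumes a test class extending BaseTokenStreamTestCase (which supplies TEST_VERSION_CURRENT and newAttributeFactory()), and the class and method names are illustrative only, not taken from any file in this commit.

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Hypothetical test class, shown only to illustrate the refactoring pattern.
public class AttributeFactoryPatternSketch extends BaseTokenStreamTestCase {

  public void testSketch() throws Exception {
    // Before: new StandardTokenizer(TEST_VERSION_CURRENT) always used the
    // default AttributeFactory. After: pass the randomized factory so the
    // test also exercises non-default attribute implementations.
    Tokenizer standard = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());

    // Before: a real WhitespaceTokenizer as a throwaway fixture. After:
    // MockTokenizer, which additionally enforces the
    // reset()/incrementToken()/end()/close() contract on the consumer.
    Tokenizer mock = new MockTokenizer(MockTokenizer.WHITESPACE, false);

    // Before: the "esoteric" Token constructor that bundles a type string:
    //   Token token = new Token("hallo", 10, 20, "someType");
    // After: the basic constructor plus an explicit setter.
    Token token = new Token("hallo", 10, 20);
    token.setType("someType");
  }
}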
@@ -34,8 +34,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
TokenStream stream = tokenFilterFactory("ArabicNormalization").create(tokenizer);
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
}

@@ -45,8 +44,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
TokenStream stream = tokenFilterFactory("ArabicNormalization").create(tokenizer);
stream = tokenFilterFactory("ArabicStem").create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});

@@ -57,8 +55,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testPersianCharFilter() throws Exception {
Reader reader = charFilterFactory("Persian").create(new StringReader("میخورد"));
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
assertTokenStreamContents(tokenizer, new String[] { "می", "خورد" });
}
@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

@@ -32,7 +33,7 @@ public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
}
};

@@ -87,6 +88,13 @@ public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
}

public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
}
}
@@ -180,7 +180,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("ab", "cd", "ef");

Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdef"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
tokenizer,

@@ -200,7 +200,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testWordComponentWithLessThanMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");

Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
tokenizer,

@@ -222,7 +222,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung");

Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
wsTokenizer.setEnableChecks(false); // we will reset in a strange place
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
wsTokenizer, dict,

@@ -246,7 +247,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testRetainMockAttribute() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
stream = new DictionaryCompoundWordTokenFilter(
@@ -22,6 +22,7 @@ import java.io.StringReader;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockReaderWrapper;
import org.apache.lucene.analysis.TokenStream;

@@ -29,7 +30,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;

@@ -44,7 +44,7 @@ import org.apache.lucene.util.automaton.Transition;
* Any tests here need to probably consider unicode version of the JRE (it could
* cause false fails).
*/
public class TestDuelingAnalyzers extends LuceneTestCase {
public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
private CharacterRunAutomaton jvmLetter;

@Override

@@ -71,7 +71,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@@ -91,7 +91,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@@ -109,7 +109,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@@ -128,7 +128,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@@ -146,7 +146,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@@ -165,7 +165,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@@ -179,7 +179,7 @@ public class TestFactories extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tf = tokenizer.create();
Tokenizer tf = tokenizer.create(newAttributeFactory());
if (tokenfilter != null) {
return new TokenStreamComponents(tf, tokenfilter.create(tf));
} else {
@@ -355,6 +355,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return TEST_VERSION_CURRENT;
}
});
put(AttributeFactory.class, new ArgProducer() {
@Override public Object create(Random random) {
return newAttributeFactory(random);
}
});
put(Set.class, new ArgProducer() {
@Override public Object create(Random random) {
// TypeTokenFilter

@@ -582,10 +587,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == AttributeFactory.class) {
// TODO: maybe the collator one...???
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
} else if (paramType == AttributeSource.class) {
if (paramType == AttributeSource.class) {
// TODO: args[i] = new AttributeSource();
// this is currently too scary to deal with!
args[i] = null; // force IAE
@@ -50,7 +50,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {

Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer);
}
};

@@ -298,7 +298,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
return new TokenStreamComponents(tokenizer, tokenStream);
}
@@ -36,7 +36,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT);
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
input.setReader(reader);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, input, stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});

@@ -85,7 +85,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilterWhitelist() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = Collections.singleton("<NUM>");
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT);
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
input.setReader(reader);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, input, stopTypes, true);
assertTokenStreamContents(stream, new String[]{"121", "123"});
@@ -47,7 +47,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}

@@ -56,7 +56,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {

Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer);
}
};

@@ -103,7 +103,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer urlAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);

@@ -113,7 +113,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer emailAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@@ -42,7 +42,7 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new KeywordTokenizer();
Tokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(t,
new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
}

@@ -54,12 +54,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
vocOut.close();
}

// LUCENE-3043: we use keywordtokenizer in this test,
// so ensure the stemmer does not crash on zero-length strings.
public void testEmpty() throws Exception {
assertAnalyzesTo(analyzer, "", new String[] { "" });
}

public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {
@@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

@@ -37,9 +38,8 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new GalicianStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new GalicianStemFilter(source));
}
};
@@ -34,8 +34,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testIndicNormalizer() throws Exception {
Reader reader = new StringReader("ত্ अाैर");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
assertTokenStreamContents(stream, new String[] { "ৎ", "और" });
}

@@ -45,8 +44,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testHindiNormalizer() throws Exception {
Reader reader = new StringReader("क़िताब");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
stream = tokenFilterFactory("HindiNormalization").create(stream);
assertTokenStreamContents(stream, new String[] {"किताब"});

@@ -57,8 +55,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("किताबें");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
stream = tokenFilterFactory("HindiNormalization").create(stream);
stream = tokenFilterFactory("HindiStem").create(stream);
@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

@@ -33,7 +34,7 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
}
};

@@ -114,7 +115,7 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
Analyzer b = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
}
};
@@ -37,7 +37,8 @@ public class TestSingleTokenTokenFilter extends LuceneTestCase {
assertEquals(token, tokenAtt);
assertFalse(ts.incrementToken());

token = new Token("hallo", 10, 20, "someType");
token = new Token("hallo", 10, 20);
token.setType("someType");
ts.setToken(token);
ts.reset();
@@ -46,11 +46,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
char[] whitespace = " ".toCharArray();
char[] empty = "".toCharArray();

TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5),
new Token(b, 0, b.length, 6, 10),
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21));
TokenStream ts = new IterTokenStream(new Token(new String(a, 0, a.length), 1, 5),
new Token(new String(b, 0, b.length), 6, 10),
new Token(new String(ccc, 0, ccc.length), 11, 15),
new Token(new String(whitespace, 0, whitespace.length), 16, 20),
new Token(new String(empty, 0, empty.length), 21, 21));
ts = new TrimFilter(TEST_VERSION_CURRENT, ts);

assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
@@ -355,7 +355,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
}

@@ -379,7 +380,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
}
}
@@ -27,11 +27,14 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;

public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {

public void testBasic() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c"},

@@ -43,7 +46,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiter() throws Exception {
String path = "/a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c", "/a/b/c/"},

@@ -55,7 +58,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfChar() throws Exception {
String path = "a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c"},

@@ -67,7 +70,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiter() throws Exception {
String path = "a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c", "a/b/c/"},

@@ -79,7 +82,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiter() throws Exception {
String path = "/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/"},

@@ -91,7 +94,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiters() throws Exception {
String path = "//";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/", "//"},

@@ -103,7 +106,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testReplace() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( '/', '\\' );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), '/', '\\', DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"\\a", "\\a\\b", "\\a\\b\\c"},

@@ -115,7 +118,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testWindowsPath() throws Exception {
String path = "c:\\a\\b\\c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( '\\', '\\' );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), '\\', '\\', DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},

@@ -131,7 +134,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
NormalizeCharMap normMap = builder.build();
String path = "c:\\a\\b\\c";
Reader cs = new MappingCharFilter(normMap, new StringReader(path));
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(cs);
assertTokenStreamContents(t,
new String[]{"c:", "c:/a", "c:/a/b", "c:/a/b/c"},

@@ -143,7 +146,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testBasicSkip() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c"},

@@ -155,7 +158,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterSkip() throws Exception {
String path = "/a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c", "/b/c/"},

@@ -167,7 +170,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharSkip() throws Exception {
String path = "a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c"},

@@ -179,7 +182,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterSkip() throws Exception {
String path = "a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer(1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c", "/b/c/"},

@@ -191,7 +194,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterSkip() throws Exception {
String path = "/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{},

@@ -203,7 +206,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersSkip() throws Exception {
String path = "//";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader( new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},

@@ -218,11 +221,12 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PathHierarchyTokenizer();
Tokenizer tokenizer = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}

/** blast some random large strings through the analyzer */

@@ -231,10 +235,11 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PathHierarchyTokenizer();
Tokenizer tokenizer = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027, false, false);
}
}
@@ -26,11 +26,14 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;

import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP;

public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {

public void testBasicReverse() throws Exception {
String path = "/a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/b/c", "a/b/c", "b/c", "c"},

@@ -42,7 +45,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterReverse() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/b/c/", "a/b/c/", "b/c/", "c/"},

@@ -54,7 +57,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharReverse() throws Exception {
String path = "a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/c", "b/c", "c"},

@@ -66,7 +69,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterReverse() throws Exception {
String path = "a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/c/", "b/c/", "c/"},

@@ -78,7 +81,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterReverse() throws Exception {
String path = "/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},

@@ -90,7 +93,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersReverse() throws Exception {
String path = "//";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"//", "/"},

@@ -102,7 +105,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterReverseSkip() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
new StringReader(path);
assertTokenStreamContents(t,

@@ -115,7 +118,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharReverseSkip() throws Exception {
String path = "a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/", "b/"},

@@ -127,7 +130,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
String path = "a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/", "b/"},

@@ -139,7 +142,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterReverseSkip() throws Exception {
String path = "/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{},

@@ -151,7 +154,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersReverseSkip() throws Exception {
String path = "//";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},

@@ -163,7 +166,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testReverseSkip2() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 2 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 2);
t.setReader( new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/", "a/"},

@@ -178,11 +181,12 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}

/** blast some random large strings through the analyzer */

@@ -191,10 +195,11 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027, false, false);
}
}
@@ -53,7 +53,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
};

for( String[] test : tests ) {
TokenStream stream = new PatternTokenizer(Pattern.compile(test[1]), Integer.parseInt(test[0]));
TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
((Tokenizer)stream).setReader(new StringReader(test[2]));
String out = tsToString( stream );
// System.out.println( test[2] + " ==> " + out );

@@ -86,7 +86,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );

// create PatternTokenizer
Tokenizer stream = new PatternTokenizer(Pattern.compile("[,;/\\s]+"), -1);
Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
stream.setReader(charStream);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },

@@ -95,7 +95,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
INPUT.length());

charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
stream = new PatternTokenizer(Pattern.compile("Günther"), 0);
stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
stream.setReader(charStream);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },

@@ -132,7 +132,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PatternTokenizer(Pattern.compile("a"), -1);
Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
return new TokenStreamComponents(tokenizer);
}
};

@@ -141,7 +141,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PatternTokenizer(Pattern.compile("a"), 0);
Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
return new TokenStreamComponents(tokenizer);
}
};
@@ -28,7 +28,7 @@ public class TestPatternTokenizerFactory extends BaseTokenStreamFactoryTestCase
public void testFactory() throws Exception {
final Reader reader = new StringReader("Günther Günther is here");
// create PatternTokenizer
Tokenizer stream = tokenizerFactory("Pattern", "pattern", "[,;/\\s]+").create();
Tokenizer stream = tokenizerFactory("Pattern", "pattern", "[,;/\\s]+").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" });
@@ -18,18 +18,14 @@ package org.apache.lucene.analysis.pt;
*/

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -41,9 +37,8 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(source));
}
};
@@ -18,18 +18,14 @@ package org.apache.lucene.analysis.pt;
*/

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;

@@ -41,9 +37,8 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(source));
}
};
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.pt;
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

@@ -28,9 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

/**

@@ -40,9 +37,8 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseStemFilter(source));
}
};
@@ -1096,7 +1096,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
private static Token createToken
(String term, int start, int offset, int positionIncrement)
{
Token token = new Token(start, offset);
Token token = new Token();
token.setOffset(start, offset);
token.copyBuffer(term.toCharArray(), 0, term.length());
token.setPositionIncrement(positionIncrement);
return token;
@@ -151,7 +151,9 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);

final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer2.toString()));
MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader(buffer2.toString()));
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;
@@ -34,7 +34,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("Wha\u0301t's this thing do?");
Tokenizer stream = tokenizerFactory("Standard").create();
Tokenizer stream = tokenizerFactory("Standard").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"Wha\u0301t's", "this", "thing", "do"});

@@ -49,7 +49,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("Standard",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"one", "two", "three", longWord, "four", "five", "six"});

@@ -60,7 +60,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testClassicTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Classic").create();
Tokenizer stream = tokenizerFactory("Classic").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"What's", "this", "thing", "do"});

@@ -75,7 +75,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("Classic",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"one", "two", "three", longWord, "four", "five", "six"});

@@ -86,7 +86,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer tokenizer = tokenizerFactory("Classic").create();
Tokenizer tokenizer = tokenizerFactory("Classic").create(newAttributeFactory());
tokenizer.setReader(reader);
TokenStream stream = tokenFilterFactory("Classic").create(tokenizer);
assertTokenStreamContents(stream,

@@ -109,7 +109,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testWhitespaceTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Whitespace").create();
Tokenizer stream = tokenizerFactory("Whitespace").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "What's", "this", "thing", "do?" });

@@ -120,7 +120,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testLetterTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Letter").create();
Tokenizer stream = tokenizerFactory("Letter").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "What", "s", "this", "thing", "do" });

@@ -131,7 +131,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("LowerCase").create();
Tokenizer stream = tokenizerFactory("LowerCase").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "what", "s", "this", "thing", "do" });
@@ -31,7 +31,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testUAX29URLEmailTokenizer() throws Exception {
Reader reader = new StringReader("Wha\u0301t's this thing do?");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Wha\u0301t's", "this", "thing", "do" });

@@ -39,7 +39,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testArabic() throws Exception {
Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",

@@ -48,7 +48,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testChinese() throws Exception {
Reader reader = new StringReader("我是中国人。 1234 Tests ");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "我", "是", "中", "国", "人", "1234", "Tests" });

@@ -56,7 +56,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testKorean() throws Exception {
Reader reader = new StringReader("안녕하세요 한글입니다");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "안녕하세요", "한글입니다" });

@@ -64,7 +64,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testHyphen() throws Exception {
Reader reader = new StringReader("some-dashed-phrase");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "some", "dashed", "phrase" });

@@ -87,7 +87,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ " blah Sirrah woof "
+ "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
Reader reader = new StringReader(textWithURLs);
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {

@@ -126,7 +126,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
+ "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
Reader reader = new StringReader(textWithEmails);
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {

@@ -157,7 +157,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("UAX29URLEmail",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {"one", "two", "three", longWord, "four", "five", "six" });
@ -31,7 +31,7 @@ public class TestThaiTokenizerFactory extends BaseTokenStreamFactoryTestCase {
*/
public void testWordBreak() throws Exception {
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
Tokenizer tokenizer = tokenizerFactory("Thai").create();
Tokenizer tokenizer = tokenizerFactory("Thai").create(newAttributeFactory());
tokenizer.setReader(new StringReader("การที่ได้ต้องแสดงว่างานดี"));
assertTokenStreamContents(tokenizer, new String[] {"การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});

@ -52,7 +52,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
builder.insert(1023, "\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}

@ -70,7 +70,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("a");
}
builder.append("\ud801\udc1cabc");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
}

@ -85,7 +85,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 255; i++) {
builder.append("A");
}
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}

@ -100,7 +100,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("A");
}
builder.append("\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}

@ -110,7 +110,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c > 0xffff) {

@ -148,7 +148,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c <= 0xffff) {
@ -40,7 +40,7 @@ public class TestElision extends BaseTokenStreamTestCase {
public void testElision() throws Exception {
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(test));
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
TokenFilter filter = new ElisionFilter(tokenizer, articles);

@ -140,7 +140,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public WholeSentenceTokenizer() {
super(BreakIterator.getSentenceInstance(Locale.ROOT));
super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override

@ -178,7 +178,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public SentenceAndWordTokenizer() {
super(BreakIterator.getSentenceInstance(Locale.ROOT));
super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
public class TestWikipediaTokenizerFactory extends BaseTokenStreamFactoryTestCase {
public void testTokenizer() throws Exception {
Reader reader = new StringReader("This is a [[Category:foo]]");
Tokenizer tokenizer = tokenizerFactory("Wikipedia").create();
Tokenizer tokenizer = tokenizerFactory("Wikipedia").create(newAttributeFactory());
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer,
new String[] { "This", "is", "a", "foo" },

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.wikipedia;
import java.io.StringReader;
import java.io.IOException;
import java.util.Collections;
import java.util.Random;
import java.util.Set;
import java.util.HashSet;

@ -39,7 +40,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
public void testSimple() throws Exception {
String text = "This is a [[Category:foo]]";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(text));
assertTokenStreamContents(tf,
new String[] { "This", "is", "a", "foo" },

@ -62,7 +63,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
+ " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
+ " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] {"link", "This", "is", "a",

@ -104,7 +105,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
}
public void testLinkPhrases() throws Exception {
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
}

@ -118,7 +119,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
public void testLinks() throws Exception {
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] { "http://lucene.apache.org/java/docs/index.html#news", "here",

@ -134,7 +135,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
//should be exactly the same, regardless of untoks
WikipediaTokenizer tf = new WikipediaTokenizer(WikipediaTokenizer.TOKENS_ONLY, untoks);
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";

@ -155,7 +156,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
untoks.add(WikipediaTokenizer.ITALICS);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the indivual tokens plus the untokenized tokens as well. Untokenized tokens
WikipediaTokenizer tf = new WikipediaTokenizer(WikipediaTokenizer.BOTH, untoks);
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g",

@ -167,7 +168,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
);
// now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
tf = new WikipediaTokenizer(WikipediaTokenizer.BOTH, untoks);
tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
int expectedFlags[] = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0,
0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };

@ -187,11 +188,12 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WikipediaTokenizer();
Tokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
/** blast some random large strings through the analyzer */

@ -201,10 +203,11 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WikipediaTokenizer();
Tokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192, false, false);
}
}
@ -77,7 +77,7 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, newAttributeFactory(), 1, 1);
tokenStream.setReader(reader);
assertTokenStreamContents(tokenStream,

@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
tokenizer.setReader(new StringReader(input));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}

@ -53,7 +53,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append('a');
}
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
tokenizer.setReader(new StringReader(input));
char token[] = new char[4096];
Arrays.fill(token, 'a');

@ -70,7 +70,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
private Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@ -30,7 +30,7 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new ICUTokenizer());
return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true)));
}
};

@ -33,7 +33,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String,String>());
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",

@ -48,7 +48,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(this.getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" },

@ -62,7 +62,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "One-two", "punch",

@ -82,7 +82,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream, new String[] { "Some", "English",
"Немного русский. ",
@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}

@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);

@ -36,7 +36,7 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.EXTENDED);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -32,7 +32,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
}
};

@ -48,7 +48,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE);
Tokenizer source = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
}

@ -32,7 +32,7 @@ public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream ts = tokenizerFactory.create();
TokenStream ts = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("それはまだ実験段階にあります"));
JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new HashMap<String,String>());
ts = factory.create(ts);
@ -45,7 +45,7 @@ public class TestJapaneseIterationMarkCharFilter extends BaseTokenStreamTestCase
private Analyzer japaneseAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}

@ -50,7 +50,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"});
}

@ -67,7 +67,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"});
}

@ -84,7 +84,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"});
}
@ -32,7 +32,7 @@ public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCa
public void testKatakanaStemming() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。"));
JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new HashMap<String,String>());;
assertTokenStreamContents(filterFactory.create(tokenStream),

@ -35,7 +35,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
private Analyzer katakanaAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, false));
}
};

@ -43,7 +43,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
private Analyzer romajiAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, true));
}
};

@ -59,7 +59,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
}

@ -79,7 +79,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
}
@ -62,7 +62,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@ -70,7 +70,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzerNormal = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.NORMAL);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@ -78,7 +78,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), true, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@ -86,7 +86,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), true, Mode.EXTENDED);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

@ -202,7 +202,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
TokenStream graph = new MockGraphTokenFilter(random(), tokenizer);
return new TokenStreamComponents(tokenizer, graph);
}

@ -352,7 +352,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
JapaneseTokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
JapaneseTokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
tokenizer.setGraphvizFormatter(gv2);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@ -34,7 +34,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
public void testSimple() throws IOException {
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("これは本ではない"));
assertTokenStreamContents(ts,
new String[] { "これ", "は", "本", "で", "は", "ない" },

@ -49,7 +49,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
public void testDefaults() throws IOException {
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }

@ -64,7 +64,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("mode", "normal");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
new String[] { "シニアソフトウェアエンジニア" }

@ -85,7 +85,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("userDictionary", "userdict.txt");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(userDict));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("関西国際空港に行った"));
assertTokenStreamContents(ts,
new String[] { "関西", "国際", "空港", "に", "行っ", "た" }

@ -100,7 +100,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("discardPunctuation", "false");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。"));
assertTokenStreamContents(ts,
new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",

@ -34,7 +34,7 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -17,7 +17,6 @@
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

@ -25,54 +24,47 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.TestUtil;

public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {

private TokenStream whitespaceTokenizer(String data) throws IOException {
WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
whitespaceTokenizer.setReader(new StringReader(data));
return whitespaceTokenizer;
}

public void testSize4FalseInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "ANTR" });
}

public void testSize4TrueInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}

public void testAlternateInjectFalse() throws Exception {
TokenStream stream = whitespaceTokenizer("Kuczewski");
TokenStream stream = whitespaceMockTokenizer("Kuczewski");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}

public void testSize8FalseInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}

public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = whitespaceTokenizer("12345 #$%@#^%&");
TokenStream stream = whitespaceMockTokenizer("12345 #$%@#^%&");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}

public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = whitespaceTokenizer("12345 #$%@#^%&");
TokenStream stream = whitespaceMockTokenizer("12345 #$%@#^%&");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });

// should have something after the stream
stream = whitespaceTokenizer("12345 #$%@#^%& hello");
stream = whitespaceMockTokenizer("12345 #$%@#^%& hello");
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}
@ -110,7 +110,7 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
}
public void testCustomAttribute() throws IOException {
TokenStream stream = new KeywordTokenizer();
TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));

@ -34,7 +34,7 @@ public class TestHMMChineseTokenizerFactory extends BaseTokenStreamTestCase {
public void testSimple() throws Exception {
Reader reader = new StringReader("我购买了道具和服装。");
TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
Tokenizer tokenizer = factory.create();
Tokenizer tokenizer = factory.create(newAttributeFactory());
tokenizer.setReader(reader);
// TODO: fix smart chinese to not emit punctuation tokens
// at the moment: you have to clean up with WDF, or use the stoplist, etc
@ -40,16 +40,9 @@ public class TestToken extends LuceneTestCase {
assertEquals("word", t.type());
assertEquals(0, t.getFlags());
t = new Token(6, 22);
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());
assertEquals(6, t.startOffset());
assertEquals(22, t.endOffset());
assertEquals("word", t.type());
assertEquals(0, t.getFlags());
t = new Token(6, 22, 7);
t = new Token();
t.setOffset(6, 22);
t.setFlags(7);
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());

@ -58,7 +51,9 @@ public class TestToken extends LuceneTestCase {
assertEquals("word", t.type());
assertEquals(7, t.getFlags());
t = new Token(6, 22, "junk");
t = new Token();
t.setOffset(6, 22);
t.setType("junk");
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());

@ -174,7 +169,8 @@ public class TestToken extends LuceneTestCase {
}
public void testClone() throws Exception {
Token t = new Token(0, 5);
Token t = new Token();
t.setOffset(0, 5);
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, 5);
char[] buf = t.buffer();

@ -195,7 +191,8 @@ public class TestToken extends LuceneTestCase {
assertEquals("", t.toString());
assertEquals("", copy.toString());
t = new Token(0, 5);
t = new Token();
t.setOffset(0, 5);
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, 5);
char[] buf = t.buffer();

@ -245,7 +242,8 @@ public class TestToken extends LuceneTestCase {
}
public void testAttributeReflection() throws Exception {
Token t = new Token("foobar", 6, 22, 8);
Token t = new Token("foobar", 6, 22);
t.setFlags(8);
TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar");
@ -591,7 +591,6 @@ public class TestPayloads extends LuceneTestCase {
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);

@ -603,7 +602,6 @@ public class TestPayloads extends LuceneTestCase {
writer.addDocument(doc);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("another"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();

@ -625,7 +623,6 @@ public class TestPayloads extends LuceneTestCase {
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new TextField("field", "", Field.Store.NO);

@ -638,8 +635,6 @@ public class TestPayloads extends LuceneTestCase {
Field field3 = new TextField("field", "", Field.Store.NO);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("nopayload"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);

@ -51,7 +51,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);

@ -65,7 +64,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("another"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
writer.addDocument(doc);

@ -96,7 +94,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new Field("field", "", customType);

@ -109,7 +106,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field3 = new Field("field", "", customType);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("nopayload"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);
@ -67,7 +67,8 @@ public class TokenGroup {
tot += score;
}
}
Token token = new Token(termStartOffset, termEndOffset);
Token token = new Token();
token.setOffset(termStartOffset, termEndOffset);
token.setEmpty().append(termAtt);
tokens[numTokens] = token;
scores[numTokens] = score;

@ -314,10 +314,10 @@ public class HighlighterPhraseTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
new Token("the", 0, 3),
new Token("fox", 4, 7),
new Token("did", 8, 11),
new Token("jump", 16, 20) };
this.tokens[3].setPositionIncrement(2);
}
}

@ -354,10 +354,10 @@ public class HighlighterPhraseTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 8, 14),
new Token(new char[] { 'j', 'u', 'm', 'p', 'e', 'd' }, 0, 6, 8, 14) };
new Token("the", 0, 3),
new Token("fox", 4, 7),
new Token("jump", 8, 14),
new Token("jumped", 8, 14) };
this.tokens[3].setPositionIncrement(0);
}
}

@ -2013,7 +2013,8 @@ final class SynonymTokenizer extends TokenStream {
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
currentRealToken = new Token();
currentRealToken.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
currentRealToken.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
}
@ -78,12 +78,12 @@ public class TokenSourcesTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
new Token("the", 0, 3),
new Token("{fox}", 0, 7),
new Token("fox", 4, 7),
new Token("did", 8, 11),
new Token("not", 12, 15),
new Token("jump", 16, 20)};
this.tokens[1].setPositionIncrement(0);
}
}
@ -39,6 +39,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;

@ -933,5 +934,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
mockTokenizer.setReader(new StringReader(input));
return mockTokenizer;
}

/** Returns a new AttributeFactory impl */
public static AttributeFactory newAttributeFactory(Random random) {
if (random.nextBoolean()) {
return Token.TOKEN_ATTRIBUTE_FACTORY;
} else {
return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
}
}

/** Returns a new AttributeFactory impl */
public static AttributeFactory newAttributeFactory() {
return newAttributeFactory(random());
}
}
@ -98,7 +98,7 @@ public class MockTokenizer extends Tokenizer {
}
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, runAutomaton, lowerCase, maxTokenLength);
this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength);
}
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
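For reference, a minimal sketch of how a test typically picks up the randomized attribute factory introduced in BaseTokenStreamTestCase above; the test class name and sample input here are illustrative only and are not part of this commit:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class AttributeFactoryUsageSketch extends BaseTokenStreamTestCase {
  public void testWithRandomAttributeFactory() throws Exception {
    // newAttributeFactory() randomly returns Token.TOKEN_ATTRIBUTE_FACTORY or
    // AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, as defined in the hunk above,
    // so the same test exercises both attribute implementations across runs.
    Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
    tokenizer.setReader(new StringReader("hello world"));
    assertTokenStreamContents(tokenizer, new String[] { "hello", "world" });
  }
}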