LUCENE-5642: Randomize attributeFactory in tests, use MockTokenizer more where possible, reduce use of esoteric Token ctors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1592339 13f79535-47bb-0310-9956-ffa450edef68
commit 94c513ad5b
parent d67884a07c
Author: Robert Muir
Date:   2014-05-04 12:19:35 +00:00

60 changed files with 255 additions and 253 deletions
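
The diffs below repeat three mechanical patterns across the changed test files: tokenizers built through MockTokenizer (or the whitespaceMockTokenizer helper) instead of concrete analysis-chain tokenizers, tokenizer constructors fed the test framework's newAttributeFactory() instead of the implicit default attribute factory, and Token instances built with the plain constructor plus setters instead of the esoteric char[]/offset/type constructors. A small illustrative test collecting all three shapes is sketched below; the class name and literal inputs are invented for illustration, but the calls themselves are the ones used throughout the diff.

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class TestLucene5642Patterns extends BaseTokenStreamTestCase {
  public void testPatterns() throws Exception {
    // MockTokenizer in place of a concrete tokenizer: whitespace splitting, no lowercasing.
    Tokenizer mock = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mock.setReader(new StringReader("abc def"));
    assertTokenStreamContents(mock, new String[] { "abc", "def" });

    // Randomized attribute factory from the test framework instead of the implicit default.
    Tokenizer standard = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
    standard.setReader(new StringReader("abc def"));
    assertTokenStreamContents(standard, new String[] { "abc", "def" });

    // Plain Token construction instead of the char[]/offset/type constructors being phased out.
    Token token = new Token();
    token.copyBuffer("abc".toCharArray(), 0, 3);
    token.setOffset(0, 3);
    token.setPositionIncrement(1);
  }
}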

View File

@ -34,8 +34,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
TokenStream stream = tokenFilterFactory("ArabicNormalization").create(tokenizer);
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
}
@ -45,8 +44,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
TokenStream stream = tokenFilterFactory("ArabicNormalization").create(tokenizer);
stream = tokenFilterFactory("ArabicStem").create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
@ -57,8 +55,7 @@ public class TestArabicFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testPersianCharFilter() throws Exception {
Reader reader = charFilterFactory("Persian").create(new StringReader("می‌خورد"));
Tokenizer tokenizer = tokenizerFactory("Standard").create();
tokenizer.setReader(reader);
Tokenizer tokenizer = whitespaceMockTokenizer(reader);
assertTokenStreamContents(tokenizer, new String[] { "می", "خورد" });
}
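
The whitespaceMockTokenizer(reader) calls above replace the factory-created StandardTokenizer with a MockTokenizer from the test framework. The helper's body is not part of this diff; a minimal sketch, assuming it only builds a whitespace MockTokenizer and attaches the reader, would be:

// Assumed shape of the helper used above; the actual method in the test framework may differ.
public static MockTokenizer whitespaceMockTokenizer(Reader reader) throws IOException {
  MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  mockTokenizer.setReader(reader);
  return mockTokenizer;
}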

View File

@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@ -32,7 +33,7 @@ public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
}
};
@ -87,6 +88,13 @@ public class TestSoraniNormalizationFilter extends BaseTokenStreamTestCase {
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new SoraniNormalizationFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
}
}

View File

@ -180,7 +180,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("ab", "cd", "ef");
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdef"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
tokenizer,
@ -200,7 +200,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testWordComponentWithLessThanMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
tokenizer,
@ -222,7 +222,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung");
Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
wsTokenizer.setEnableChecks(false); // we will reset in a strange place
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
wsTokenizer, dict,
@ -246,7 +247,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testRetainMockAttribute() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
stream = new DictionaryCompoundWordTokenFilter(

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockReaderWrapper;
import org.apache.lucene.analysis.TokenStream;
@ -29,7 +30,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
@ -44,7 +44,7 @@ import org.apache.lucene.util.automaton.Transition;
* Any tests here need to probably consider unicode version of the JRE (it could
* cause false fails).
*/
public class TestDuelingAnalyzers extends LuceneTestCase {
public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
private CharacterRunAutomaton jvmLetter;
@Override
@ -71,7 +71,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -91,7 +91,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -109,7 +109,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -128,7 +128,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -146,7 +146,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -165,7 +165,7 @@ public class TestDuelingAnalyzers extends LuceneTestCase {
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

View File

@ -179,7 +179,7 @@ public class TestFactories extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tf = tokenizer.create();
Tokenizer tf = tokenizer.create(newAttributeFactory());
if (tokenfilter != null) {
return new TokenStreamComponents(tf, tokenfilter.create(tf));
} else {

View File

@ -355,6 +355,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return TEST_VERSION_CURRENT;
}
});
put(AttributeFactory.class, new ArgProducer() {
@Override public Object create(Random random) {
return newAttributeFactory(random);
}
});
put(Set.class, new ArgProducer() {
@Override public Object create(Random random) {
// TypeTokenFilter
@ -582,10 +587,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == AttributeFactory.class) {
// TODO: maybe the collator one...???
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
} else if (paramType == AttributeSource.class) {
if (paramType == AttributeSource.class) {
// TODO: args[i] = new AttributeSource();
// this is currently too scary to deal with!
args[i] = null; // force IAE
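
With the AttributeFactory ArgProducer registered above, the hard-coded AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY branch in the constructor-argument loop is no longer needed: AttributeFactory parameters are now filled by newAttributeFactory(random) like any other argument type. The helper's implementation is not shown in this diff; a purely hypothetical sketch, assuming it simply flips a coin between the default factory and the Token-based factory available in this codebase at the time, would be:

// Hypothetical sketch only; the real newAttributeFactory(Random) in the test framework may differ.
public static AttributeFactory newAttributeFactory(Random random) {
  return random.nextBoolean()
      ? Token.TOKEN_ATTRIBUTE_FACTORY
      : AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
}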

View File

@ -50,7 +50,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer);
}
};
@ -298,7 +298,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
return new TokenStreamComponents(tokenizer, tokenStream);
}

View File

@ -36,7 +36,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT);
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
input.setReader(reader);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, input, stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
@ -85,7 +85,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilterWhitelist() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = Collections.singleton("<NUM>");
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT);
final StandardTokenizer input = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
input.setReader(reader);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, input, stopTypes, true);
assertTokenStreamContents(stream, new String[]{"121", "123"});

View File

@ -47,7 +47,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -56,7 +56,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
return new TokenStreamComponents(tokenizer);
}
};
@ -103,7 +103,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer urlAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
@ -113,7 +113,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer emailAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}

View File

@ -42,7 +42,7 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new KeywordTokenizer();
Tokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(t,
new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
}
@ -54,12 +54,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
vocOut.close();
}
// LUCENE-3043: we use keywordtokenizer in this test,
// so ensure the stemmer does not crash on zero-length strings.
public void testEmpty() throws Exception {
assertAnalyzesTo(analyzer, "", new String[] { "" });
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
Analyzer a = new Analyzer() {

View File

@ -24,6 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@ -37,9 +38,8 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new GalicianStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new GalicianStemFilter(source));
}
};

View File

@ -34,8 +34,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testIndicNormalizer() throws Exception {
Reader reader = new StringReader("ত্‍ अाैर");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
assertTokenStreamContents(stream, new String[] { "", "और" });
}
@ -45,8 +44,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testHindiNormalizer() throws Exception {
Reader reader = new StringReader("क़िताब");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
stream = tokenFilterFactory("HindiNormalization").create(stream);
assertTokenStreamContents(stream, new String[] {"किताब"});
@ -57,8 +55,7 @@ public class TestHindiFilters extends BaseTokenStreamFactoryTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("किताबें");
TokenStream stream = tokenizerFactory("Standard").create();
((Tokenizer)stream).setReader(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("IndicNormalization").create(stream);
stream = tokenFilterFactory("HindiNormalization").create(stream);
stream = tokenFilterFactory("HindiStem").create(stream);

View File

@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@ -33,7 +34,7 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
}
};
@ -114,7 +115,7 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
Analyzer b = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer, false));
}
};

View File

@ -37,7 +37,8 @@ public class TestSingleTokenTokenFilter extends LuceneTestCase {
assertEquals(token, tokenAtt);
assertFalse(ts.incrementToken());
token = new Token("hallo", 10, 20, "someType");
token = new Token("hallo", 10, 20);
token.setType("someType");
ts.setToken(token);
ts.reset();

View File

@ -46,11 +46,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
char[] whitespace = " ".toCharArray();
char[] empty = "".toCharArray();
TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5),
new Token(b, 0, b.length, 6, 10),
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21));
TokenStream ts = new IterTokenStream(new Token(new String(a, 0, a.length), 1, 5),
new Token(new String(b, 0, b.length), 6, 10),
new Token(new String(ccc, 0, ccc.length), 11, 15),
new Token(new String(whitespace, 0, whitespace.length), 16, 20),
new Token(new String(empty, 0, empty.length), 21, 21));
ts = new TrimFilter(TEST_VERSION_CURRENT, ts);
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});

View File

@ -355,7 +355,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
}
@ -379,7 +380,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
}
}

View File

@ -27,11 +27,14 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;
public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testBasic() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c"},
@ -43,7 +46,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiter() throws Exception {
String path = "/a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c", "/a/b/c/"},
@ -55,7 +58,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfChar() throws Exception {
String path = "a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer();
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c"},
@ -67,7 +70,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiter() throws Exception {
String path = "a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c", "a/b/c/"},
@ -79,7 +82,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiter() throws Exception {
String path = "/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/"},
@ -91,7 +94,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiters() throws Exception {
String path = "//";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/", "//"},
@ -103,7 +106,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testReplace() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( '/', '\\' );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), '/', '\\', DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"\\a", "\\a\\b", "\\a\\b\\c"},
@ -115,7 +118,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testWindowsPath() throws Exception {
String path = "c:\\a\\b\\c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( '\\', '\\' );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), '\\', '\\', DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
@ -131,7 +134,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
NormalizeCharMap normMap = builder.build();
String path = "c:\\a\\b\\c";
Reader cs = new MappingCharFilter(normMap, new StringReader(path));
PathHierarchyTokenizer t = new PathHierarchyTokenizer( );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(cs);
assertTokenStreamContents(t,
new String[]{"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
@ -143,7 +146,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testBasicSkip() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c"},
@ -155,7 +158,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterSkip() throws Exception {
String path = "/a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c", "/b/c/"},
@ -167,7 +170,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharSkip() throws Exception {
String path = "a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c"},
@ -179,7 +182,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterSkip() throws Exception {
String path = "a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer(1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/b", "/b/c", "/b/c/"},
@ -191,7 +194,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterSkip() throws Exception {
String path = "/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{},
@ -203,7 +206,7 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersSkip() throws Exception {
String path = "//";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( 1 );
PathHierarchyTokenizer t = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader( new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},
@ -218,11 +221,12 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PathHierarchyTokenizer();
Tokenizer tokenizer = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
/** blast some random large strings through the analyzer */
@ -231,10 +235,11 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PathHierarchyTokenizer();
Tokenizer tokenizer = new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027, false, false);
}
}

View File

@ -26,11 +26,14 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP;
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testBasicReverse() throws Exception {
String path = "/a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/b/c", "a/b/c", "b/c", "c"},
@ -42,7 +45,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterReverse() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/b/c/", "a/b/c/", "b/c/", "c/"},
@ -54,7 +57,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharReverse() throws Exception {
String path = "a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/c", "b/c", "c"},
@ -66,7 +69,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterReverse() throws Exception {
String path = "a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/c/", "b/c/", "c/"},
@ -78,7 +81,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterReverse() throws Exception {
String path = "/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},
@ -90,7 +93,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersReverse() throws Exception {
String path = "//";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer();
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"//", "/"},
@ -102,7 +105,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testEndOfDelimiterReverseSkip() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
new StringReader(path);
assertTokenStreamContents(t,
@ -115,7 +118,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharReverseSkip() throws Exception {
String path = "a/b/c";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/", "b/"},
@ -127,7 +130,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
String path = "a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"a/b/", "b/"},
@ -139,7 +142,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimiterReverseSkip() throws Exception {
String path = "/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{},
@ -151,7 +154,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testOnlyDelimitersReverseSkip() throws Exception {
String path = "//";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 1 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 1);
t.setReader(new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/"},
@ -163,7 +166,7 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testReverseSkip2() throws Exception {
String path = "/a/b/c/";
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( 2 );
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, 2);
t.setReader( new StringReader(path));
assertTokenStreamContents(t,
new String[]{"/a/", "a/"},
@ -178,11 +181,12 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
/** blast some random large strings through the analyzer */
@ -191,10 +195,11 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027, false, false);
}
}

View File

@ -53,7 +53,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
};
for( String[] test : tests ) {
TokenStream stream = new PatternTokenizer(Pattern.compile(test[1]), Integer.parseInt(test[0]));
TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
((Tokenizer)stream).setReader(new StringReader(test[2]));
String out = tsToString( stream );
// System.out.println( test[2] + " ==> " + out );
@ -86,7 +86,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
// create PatternTokenizer
Tokenizer stream = new PatternTokenizer(Pattern.compile("[,;/\\s]+"), -1);
Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
stream.setReader(charStream);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
@ -95,7 +95,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
INPUT.length());
charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
stream = new PatternTokenizer(Pattern.compile("Günther"), 0);
stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
stream.setReader(charStream);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
@ -132,7 +132,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PatternTokenizer(Pattern.compile("a"), -1);
Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
return new TokenStreamComponents(tokenizer);
}
};
@ -141,7 +141,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PatternTokenizer(Pattern.compile("a"), 0);
Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
return new TokenStreamComponents(tokenizer);
}
};

View File

@ -28,7 +28,7 @@ public class TestPatternTokenizerFactory extends BaseTokenStreamFactoryTestCase
public void testFactory() throws Exception {
final Reader reader = new StringReader("Günther Günther is here");
// create PatternTokenizer
Tokenizer stream = tokenizerFactory("Pattern", "pattern", "[,;/\\s]+").create();
Tokenizer stream = tokenizerFactory("Pattern", "pattern", "[,;/\\s]+").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" });

View File

@ -18,18 +18,14 @@ package org.apache.lucene.analysis.pt;
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -41,9 +37,8 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(source));
}
};

View File

@ -18,18 +18,14 @@ package org.apache.lucene.analysis.pt;
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -41,9 +37,8 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(source));
}
};

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.pt;
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -28,9 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@ -40,9 +37,8 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT);
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
return new TokenStreamComponents(source, new PortugueseStemFilter(result));
Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(source, new PortugueseStemFilter(source));
}
};

View File

@ -1096,7 +1096,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
private static Token createToken
(String term, int start, int offset, int positionIncrement)
{
Token token = new Token(start, offset);
Token token = new Token();
token.setOffset(start, offset);
token.copyBuffer(term.toCharArray(), 0, term.length());
token.setPositionIncrement(positionIncrement);
return token;

View File

@ -151,7 +151,9 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer2.toString()));
MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader(buffer2.toString()));
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;

View File

@ -34,7 +34,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("Wha\u0301t's this thing do?");
Tokenizer stream = tokenizerFactory("Standard").create();
Tokenizer stream = tokenizerFactory("Standard").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"Wha\u0301t's", "this", "thing", "do"});
@ -49,7 +49,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("Standard",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"one", "two", "three", longWord, "four", "five", "six"});
@ -60,7 +60,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testClassicTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Classic").create();
Tokenizer stream = tokenizerFactory("Classic").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"What's", "this", "thing", "do"});
@ -75,7 +75,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("Classic",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[]{"one", "two", "three", longWord, "four", "five", "six"});
@ -86,7 +86,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer tokenizer = tokenizerFactory("Classic").create();
Tokenizer tokenizer = tokenizerFactory("Classic").create(newAttributeFactory());
tokenizer.setReader(reader);
TokenStream stream = tokenFilterFactory("Classic").create(tokenizer);
assertTokenStreamContents(stream,
@ -109,7 +109,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testWhitespaceTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Whitespace").create();
Tokenizer stream = tokenizerFactory("Whitespace").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "What's", "this", "thing", "do?" });
@ -120,7 +120,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testLetterTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("Letter").create();
Tokenizer stream = tokenizerFactory("Letter").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "What", "s", "this", "thing", "do" });
@ -131,7 +131,7 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
*/
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
Tokenizer stream = tokenizerFactory("LowerCase").create();
Tokenizer stream = tokenizerFactory("LowerCase").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "what", "s", "this", "thing", "do" });

View File

@ -31,7 +31,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testUAX29URLEmailTokenizer() throws Exception {
Reader reader = new StringReader("Wha\u0301t's this thing do?");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Wha\u0301t's", "this", "thing", "do" });
@ -39,7 +39,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testArabic() throws Exception {
Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
@ -48,7 +48,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testChinese() throws Exception {
Reader reader = new StringReader("我是中国人。 ");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "", "", "", "", "", "", "" });
@ -56,7 +56,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testKorean() throws Exception {
Reader reader = new StringReader("안녕하세요 한글입니다");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "안녕하세요", "한글입니다" });
@ -64,7 +64,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
public void testHyphen() throws Exception {
Reader reader = new StringReader("some-dashed-phrase");
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "some", "dashed", "phrase" });
@ -87,7 +87,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ " blah Sirrah woof "
+ "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
Reader reader = new StringReader(textWithURLs);
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {
@ -126,7 +126,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
+ "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
Reader reader = new StringReader(textWithEmails);
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create();
Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {
@ -157,7 +157,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
String content = "one two three " + longWord + " four five six";
Reader reader = new StringReader(content);
Tokenizer stream = tokenizerFactory("UAX29URLEmail",
"maxTokenLength", "1000").create();
"maxTokenLength", "1000").create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] {"one", "two", "three", longWord, "four", "five", "six" });

View File

@ -31,7 +31,7 @@ public class TestThaiTokenizerFactory extends BaseTokenStreamFactoryTestCase {
*/
public void testWordBreak() throws Exception {
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
Tokenizer tokenizer = tokenizerFactory("Thai").create();
Tokenizer tokenizer = tokenizerFactory("Thai").create(newAttributeFactory());
tokenizer.setReader(new StringReader("การที่ได้ต้องแสดงว่างานดี"));
assertTokenStreamContents(tokenizer, new String[] {"การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});

View File

@ -52,7 +52,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
builder.insert(1023, "\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
@ -70,7 +70,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("a");
}
builder.append("\ud801\udc1cabc");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
}
@ -85,7 +85,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 255; i++) {
builder.append("A");
}
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
@ -100,7 +100,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("A");
}
builder.append("\ud801\udc1c");
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
@ -110,7 +110,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c > 0xffff) {
@ -148,7 +148,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, newAttributeFactory()) {
@Override
protected int normalize(int c) {
if (c <= 0xffff) {

View File

@ -40,7 +40,7 @@ public class TestElision extends BaseTokenStreamTestCase {
public void testElision() throws Exception {
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT);
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
tokenizer.setReader(new StringReader(test));
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
TokenFilter filter = new ElisionFilter(tokenizer, articles);

View File

@ -140,7 +140,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public WholeSentenceTokenizer() {
super(BreakIterator.getSentenceInstance(Locale.ROOT));
super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override
@ -178,7 +178,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public SentenceAndWordTokenizer() {
super(BreakIterator.getSentenceInstance(Locale.ROOT));
super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}
@Override

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
public class TestWikipediaTokenizerFactory extends BaseTokenStreamFactoryTestCase {
public void testTokenizer() throws Exception {
Reader reader = new StringReader("This is a [[Category:foo]]");
Tokenizer tokenizer = tokenizerFactory("Wikipedia").create();
Tokenizer tokenizer = tokenizerFactory("Wikipedia").create(newAttributeFactory());
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer,
new String[] { "This", "is", "a", "foo" },

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.wikipedia;
import java.io.StringReader;
import java.io.IOException;
import java.util.Collections;
import java.util.Random;
import java.util.Set;
import java.util.HashSet;
@ -39,7 +40,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
public void testSimple() throws Exception {
String text = "This is a [[Category:foo]]";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(text));
assertTokenStreamContents(tf,
new String[] { "This", "is", "a", "foo" },
@ -62,7 +63,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
+ " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
+ " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] {"link", "This", "is", "a",
@ -104,7 +105,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
}
public void testLinkPhrases() throws Exception {
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
}
@ -118,7 +119,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
public void testLinks() throws Exception {
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf = new WikipediaTokenizer();
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] { "http://lucene.apache.org/java/docs/index.html#news", "here",
@ -134,7 +135,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
//should be exactly the same, regardless of untoks
WikipediaTokenizer tf = new WikipediaTokenizer(WikipediaTokenizer.TOKENS_ONLY, untoks);
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
@ -155,7 +156,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
untoks.add(WikipediaTokenizer.ITALICS);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
WikipediaTokenizer tf = new WikipediaTokenizer(WikipediaTokenizer.BOTH, untoks);
WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
assertTokenStreamContents(tf,
new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g",
@ -167,7 +168,7 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
);
// now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
tf = new WikipediaTokenizer(WikipediaTokenizer.BOTH, untoks);
tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
int expectedFlags[] = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0,
0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
@ -187,11 +188,12 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WikipediaTokenizer();
Tokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
/** blast some random large strings through the analyzer */
@ -201,10 +203,11 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WikipediaTokenizer();
Tokenizer tokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192, false, false);
}
}

View File

@ -77,7 +77,7 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, newAttributeFactory(), 1, 1);
tokenStream.setReader(reader);
assertTokenStreamContents(tokenStream,

View File

@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
tokenizer.setReader(new StringReader(input));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -53,7 +53,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append('a');
}
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
tokenizer.setReader(new StringReader(input));
char token[] = new char[4096];
Arrays.fill(token, 'a');
@ -70,7 +70,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
private Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}

View File

@ -30,7 +30,7 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new ICUTokenizer());
return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true)));
}
};

View File

@ -33,7 +33,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String,String>());
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
@ -48,7 +48,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(this.getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"", "+2=3$5,&813", "!@#%$^)(*@#$" },
@ -62,7 +62,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream,
new String[] { "One-two", "punch",
@ -82,7 +82,7 @@ public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
Tokenizer stream = factory.create();
Tokenizer stream = factory.create(newAttributeFactory());
stream.setReader(reader);
assertTokenStreamContents(stream, new String[] { "Some", "English",
"Немного русский. ",

View File

@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ICUTokenizer(new DefaultICUTokenizerConfig(false));
Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);

View File

@ -36,7 +36,7 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.EXTENDED);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

View File

@ -32,7 +32,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
}
};
@ -48,7 +48,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.DEFAULT_MODE);
Tokenizer source = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
}

View File

@ -32,7 +32,7 @@ public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream ts = tokenizerFactory.create();
TokenStream ts = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("それはまだ実験段階にあります"));
JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new HashMap<String,String>());
ts = factory.create(ts);

View File

@ -45,7 +45,7 @@ public class TestJapaneseIterationMarkCharFilter extends BaseTokenStreamTestCase
private Analyzer japaneseAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}

View File

@ -50,7 +50,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "", "スズ"});
}
@ -67,7 +67,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "", "", "", "ミス", ""});
}
@ -84,7 +84,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "", "", "しい", "ところどころ", "", "スズ"});
}

View File

@ -32,7 +32,7 @@ public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCa
public void testKatakanaStemming() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream tokenStream = tokenizerFactory.create();
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
((Tokenizer)tokenStream).setReader(new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。"));
JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new HashMap<String,String>());
assertTokenStreamContents(filterFactory.create(tokenStream),

View File

@ -35,7 +35,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
private Analyzer katakanaAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, false));
}
};
@ -43,7 +43,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
private Analyzer romajiAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, true));
}
};
@ -59,7 +59,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
}
@ -79,7 +79,7 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
}

View File

@ -62,7 +62,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -70,7 +70,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzerNormal = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.NORMAL);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -78,7 +78,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), true, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -86,7 +86,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), true, Mode.EXTENDED);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@ -202,7 +202,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
TokenStream graph = new MockGraphTokenFilter(random(), tokenizer);
return new TokenStreamComponents(tokenizer, graph);
}
@ -352,7 +352,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
JapaneseTokenizer tokenizer = new JapaneseTokenizer(readDict(), false, Mode.SEARCH);
JapaneseTokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.SEARCH);
tokenizer.setGraphvizFormatter(gv2);
return new TokenStreamComponents(tokenizer, tokenizer);
}

View File

@ -34,7 +34,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
public void testSimple() throws IOException {
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("これは本ではない"));
assertTokenStreamContents(ts,
new String[] { "これ", "", "", "", "", "ない" },
@ -49,7 +49,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
public void testDefaults() throws IOException {
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
@ -64,7 +64,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("mode", "normal");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
new String[] { "シニアソフトウェアエンジニア" }
@ -85,7 +85,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("userDictionary", "userdict.txt");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(userDict));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("関西国際空港に行った"));
assertTokenStreamContents(ts,
new String[] { "関西", "国際", "空港", "", "行っ", "" }
@ -100,7 +100,7 @@ public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
args.put("discardPunctuation", "false");
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create();
TokenStream ts = factory.create(newAttributeFactory());
((Tokenizer)ts).setReader(new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。"));
assertTokenStreamContents(ts,
new String[] { "", "ノルウェー", "", "", "ます", "", "",

View File

@ -34,7 +34,7 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.SEARCH);
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

View File

@ -17,7 +17,6 @@
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -25,54 +24,47 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.TestUtil;
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
private TokenStream whitespaceTokenizer(String data) throws IOException {
WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT);
whitespaceTokenizer.setReader(new StringReader(data));
return whitespaceTokenizer;
}
public void testSize4FalseInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "ANTR" });
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = whitespaceTokenizer("Kuczewski");
TokenStream stream = whitespaceMockTokenizer("Kuczewski");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = whitespaceTokenizer("international");
TokenStream stream = whitespaceMockTokenizer("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = whitespaceTokenizer("12345 #$%@#^%&");
TokenStream stream = whitespaceMockTokenizer("12345 #$%@#^%&");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = whitespaceTokenizer("12345 #$%@#^%&");
TokenStream stream = whitespaceMockTokenizer("12345 #$%@#^%&");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
// should have something after the stream
stream = whitespaceTokenizer("12345 #$%@#^%& hello");
stream = whitespaceMockTokenizer("12345 #$%@#^%& hello");
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}

View File

@ -110,7 +110,7 @@ public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
}
public void testCustomAttribute() throws IOException {
TokenStream stream = new KeywordTokenizer();
TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));

View File

@ -34,7 +34,7 @@ public class TestHMMChineseTokenizerFactory extends BaseTokenStreamTestCase {
public void testSimple() throws Exception {
Reader reader = new StringReader("我购买了道具和服装。");
TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
Tokenizer tokenizer = factory.create();
Tokenizer tokenizer = factory.create(newAttributeFactory());
tokenizer.setReader(reader);
// TODO: fix smart chinese to not emit punctuation tokens
// at the moment: you have to clean up with WDF, or use the stoplist, etc

View File

@ -40,16 +40,9 @@ public class TestToken extends LuceneTestCase {
assertEquals("word", t.type());
assertEquals(0, t.getFlags());
t = new Token(6, 22);
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());
assertEquals(6, t.startOffset());
assertEquals(22, t.endOffset());
assertEquals("word", t.type());
assertEquals(0, t.getFlags());
t = new Token(6, 22, 7);
t = new Token();
t.setOffset(6, 22);
t.setFlags(7);
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());
@ -58,7 +51,9 @@ public class TestToken extends LuceneTestCase {
assertEquals("word", t.type());
assertEquals(7, t.getFlags());
t = new Token(6, 22, "junk");
t = new Token();
t.setOffset(6, 22);
t.setType("junk");
t.copyBuffer(content, 0, content.length);
assertEquals("hello", t.toString());
assertEquals("hello", t.toString());
@ -174,7 +169,8 @@ public class TestToken extends LuceneTestCase {
}
public void testClone() throws Exception {
Token t = new Token(0, 5);
Token t = new Token();
t.setOffset(0, 5);
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, 5);
char[] buf = t.buffer();
@ -195,7 +191,8 @@ public class TestToken extends LuceneTestCase {
assertEquals("", t.toString());
assertEquals("", copy.toString());
t = new Token(0, 5);
t = new Token();
t.setOffset(0, 5);
char[] content = "hello".toCharArray();
t.copyBuffer(content, 0, 5);
char[] buf = t.buffer();
@ -245,7 +242,8 @@ public class TestToken extends LuceneTestCase {
}
public void testAttributeReflection() throws Exception {
Token t = new Token("foobar", 6, 22, 8);
Token t = new Token("foobar", 6, 22);
t.setFlags(8);
TestUtil.assertAttributeReflection(t,
new HashMap<String, Object>() {{
put(CharTermAttribute.class.getName() + "#term", "foobar");

View File

@ -591,7 +591,6 @@ public class TestPayloads extends LuceneTestCase {
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);
@ -603,7 +602,6 @@ public class TestPayloads extends LuceneTestCase {
writer.addDocument(doc);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("another"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();
@ -625,7 +623,6 @@ public class TestPayloads extends LuceneTestCase {
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new TextField("field", "", Field.Store.NO);
@ -638,8 +635,6 @@ public class TestPayloads extends LuceneTestCase {
Field field3 = new TextField("field", "", Field.Store.NO);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("nopayload"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);

View File

@ -51,7 +51,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);
@ -65,7 +64,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("another"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
writer.addDocument(doc);
@ -96,7 +94,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field = new Field("field", "", customType);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new Field("field", "", customType);
@ -109,7 +106,6 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
Field field3 = new Field("field", "", customType);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("nopayload"));
assertFalse(ts.hasAttribute(PayloadAttribute.class));
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);

View File

@ -67,7 +67,8 @@ public class TokenGroup {
tot += score;
}
}
Token token = new Token(termStartOffset, termEndOffset);
Token token = new Token();
token.setOffset(termStartOffset, termEndOffset);
token.setEmpty().append(termAtt);
tokens[numTokens] = token;
scores[numTokens] = score;

View File

@ -314,10 +314,10 @@ public class HighlighterPhraseTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
new Token("the", 0, 3),
new Token("fox", 4, 7),
new Token("did", 8, 11),
new Token("jump", 16, 20) };
this.tokens[3].setPositionIncrement(2);
}
}
@ -354,10 +354,10 @@ public class HighlighterPhraseTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 8, 14),
new Token(new char[] { 'j', 'u', 'm', 'p', 'e', 'd' }, 0, 6, 8, 14) };
new Token("the", 0, 3),
new Token("fox", 4, 7),
new Token("jump", 8, 14),
new Token("jumped", 8, 14) };
this.tokens[3].setPositionIncrement(0);
}
}

View File

@ -2013,7 +2013,8 @@ final class SynonymTokenizer extends TokenStream {
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
currentRealToken = new Token();
currentRealToken.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
currentRealToken.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
}

View File

@ -78,12 +78,12 @@ public class TokenSourcesTest extends LuceneTestCase {
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
new Token("the", 0, 3),
new Token("{fox}", 0, 7),
new Token("fox", 4, 7),
new Token("did", 8, 11),
new Token("not", 12, 15),
new Token("jump", 16, 20)};
this.tokens[1].setPositionIncrement(0);
}
}

View File

@ -39,6 +39,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
@ -933,5 +934,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
mockTokenizer.setReader(new StringReader(input));
return mockTokenizer;
}
/** Returns a new AttributeFactory impl */
public static AttributeFactory newAttributeFactory(Random random) {
if (random.nextBoolean()) {
return Token.TOKEN_ATTRIBUTE_FACTORY;
} else {
return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
}
}
/** Returns a new AttributeFactory impl */
public static AttributeFactory newAttributeFactory() {
return newAttributeFactory(random());
}
}
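For illustration only (not part of the patch): a hypothetical test could consume the new helper like the call sites changed above, so both the Token-based and the default attribute implementations get exercised at random. The tokenizer and input below are placeholders mirroring hunks elsewhere in this commit.
  // Hypothetical usage sketch; not code from this commit.
  Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, newAttributeFactory());
  tokenizer.setReader(new StringReader("testing 1234"));
  assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });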

View File

@ -98,7 +98,7 @@ public class MockTokenizer extends Tokenizer {
}
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, runAutomaton, lowerCase, maxTokenLength);
this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength);
}
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {