LUCENE-5235: Tokenizers now throw an IllegalStateException if the consumer does not call reset() before consuming the stream. Previous versions threw NullPointerException or ArrayIndexOutOfBoundsException on a best-effort basis, which was not user-friendly.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1525362 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2013-09-22 13:57:43 +00:00
parent adba0da045
commit 34adebab3b
125 changed files with 458 additions and 381 deletions

View File

@ -74,6 +74,12 @@ New Features
* LUCENE-5219: Add support to SynonymFilterFactory for custom
parsers. (Ryan Ernst via Robert Muir)
* LUCENE-5235: Tokenizers now throw an IllegalStateException if the
consumer does not call reset() before consuming the stream. Previous
versions threw NullPointerException or ArrayIndexOutOfBoundsException
on a best-effort basis, which was not user-friendly.
(Uwe Schindler, Robert Muir)
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
@ -94,6 +100,11 @@ Documentation
Changes in backwards compatibility policy
* LUCENE-5235: Subclasses of Tokenizer have to call super.reset()
when implementing reset(). Otherwise the consumer will get an
IllegalStateException because the Reader is not correctly assigned.
(Uwe Schindler, Robert Muir)
* LUCENE-5204: Directory doesn't have default implementations for
LockFactory-related methods, which have been moved to BaseDirectory. If you
had a custom Directory implementation that extended Directory, you need to

View File

@ -88,6 +88,7 @@ public final class KeywordTokenizer extends Tokenizer {
@Override
public void reset() throws IOException {
super.reset();
this.done = false;
}
}

View File

@ -140,7 +140,8 @@ public final class Lucene43NGramTokenizer extends Tokenizer {
}
@Override
public void end() {
public void end() throws IOException {
super.end();
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -138,6 +138,7 @@ public final class PatternTokenizer extends Tokenizer {
@Override
public void reset() throws IOException {
super.reset();
fillBuffer(str, input);
matcher.reset(str);
index = 0;

View File

@ -114,7 +114,7 @@ public final class ClassicTokenizer extends Tokenizer {
}
private void init(Version matchVersion) {
this.scanner = new ClassicTokenizerImpl(null); // best effort NPE if you dont call reset
this.scanner = new ClassicTokenizerImpl(input);
}
// this tokenizer generates three attributes:
@ -170,9 +170,16 @@ public final class ClassicTokenizer extends Tokenizer {
// adjust any skipped tokens
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
}
@Override
public void close() throws IOException {
super.close();
scanner.yyreset(input);
}
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
skippedPositions = 0;
}

View File

@ -128,7 +128,7 @@ public final class StandardTokenizer extends Tokenizer {
}
private final void init(Version matchVersion) {
this.scanner = new StandardTokenizerImpl(null); // best effort NPE if you dont call reset
this.scanner = new StandardTokenizerImpl(input);
}
// this tokenizer generates three attributes:
@ -179,8 +179,15 @@ public final class StandardTokenizer extends Tokenizer {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
}
@Override
public void close() throws IOException {
super.close();
scanner.yyreset(input);
}
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
skippedPositions = 0;
}

View File

@ -111,8 +111,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
this.scanner = getScannerFor(matchVersion);
}
private static StandardTokenizerInterface getScannerFor(Version matchVersion) {
return new UAX29URLEmailTokenizerImpl(null); // best effort NPE if you dont call reset
private StandardTokenizerInterface getScannerFor(Version matchVersion) {
return new UAX29URLEmailTokenizerImpl(input);
}
// this tokenizer generates three attributes:
@ -157,9 +157,16 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
// adjust any skipped tokens
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
}
@Override
public void close() throws IOException {
super.close();
scanner.yyreset(input);
}
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
skippedPositions = 0;
}

View File

@ -62,8 +62,7 @@ public abstract class CharTokenizer extends Tokenizer {
charUtils = CharacterUtils.getInstance(matchVersion);
}
// note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
@ -150,6 +149,7 @@ public abstract class CharTokenizer extends Tokenizer {
@Override
public void reset() throws IOException {
super.reset();
bufferIndex = 0;
offset = 0;
dataLen = 0;

View File

@ -143,7 +143,7 @@ public final class WikipediaTokenizer extends Tokenizer {
*/
public WikipediaTokenizer(Reader input, int tokenOutput, Set<String> untokenizedTypes) {
super(input);
this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
this.scanner = new WikipediaTokenizerImpl(this.input);
init(tokenOutput, untokenizedTypes);
}
@ -156,7 +156,7 @@ public final class WikipediaTokenizer extends Tokenizer {
*/
public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
super(factory, input);
this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
this.scanner = new WikipediaTokenizerImpl(this.input);
init(tokenOutput, untokenizedTypes);
}
@ -295,6 +295,12 @@ public final class WikipediaTokenizer extends Tokenizer {
offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
}
@Override
public void close() throws IOException {
super.close();
scanner.yyreset(input);
}
/*
* (non-Javadoc)
*
@ -302,6 +308,7 @@ public final class WikipediaTokenizer extends Tokenizer {
*/
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
tokens = null;
scanner.reset();

View File

@ -60,8 +60,8 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
*/
public void testReusableTokenStream() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
}
/**
@ -86,12 +86,12 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false);
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
}
/** blast some random strings through the analyzer */

View File

@ -102,7 +102,7 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -142,6 +142,6 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -49,8 +49,8 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws IOException {
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
assertAnalyzesTo(a, "документи", new String[] {"документ"});
assertAnalyzesTo(a, "документ", new String[] {"документ"});
}
/**

View File

@ -234,6 +234,6 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new BulgarianStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -157,7 +157,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
}
private void checkReuse(Analyzer a, String input, String expected) throws Exception {
checkOneTermReuse(a, input, expected);
checkOneTerm(a, input, expected);
}
/** blast some random strings through the analyzer */
@ -173,6 +173,6 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "llengües", "llengu");
checkOneTermReuse(a, "llengua", "llengu");
checkOneTerm(a, "llengües", "llengu");
checkOneTerm(a, "llengua", "llengu");
// stopword
assertAnalyzesTo(a, "un", new String[] { });
}
@ -52,8 +52,8 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false);
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "llengües", "llengües");
checkOneTermReuse(a, "llengua", "llengu");
checkOneTerm(a, "llengües", "llengües");
checkOneTerm(a, "llengua", "llengu");
}
/** blast some random strings through the analyzer */

View File

@ -167,14 +167,14 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
}
public void testReusableTokenStream() throws IOException {
assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "", "c", "かき", "きく", "くけ", "" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
@ -288,6 +288,6 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -74,6 +74,6 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new CJKWidthFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -377,7 +377,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
@ -390,6 +390,6 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, filter);
}
};
checkOneTermReuse(b, "", "");
checkOneTerm(b, "", "");
}
}

View File

@ -39,8 +39,8 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
Analyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
assertAnalyzesTo(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
assertAnalyzesTo(analyzer, "Česká Republika", new String[] { "česk", "republik" });
}
public void testWithStemExclusionSet() throws IOException{

View File

@ -294,7 +294,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "undersøg", "undersøg");
checkOneTermReuse(a, "undersøgelse", "undersøg");
checkOneTerm(a, "undersøg", "undersøg");
checkOneTerm(a, "undersøgelse", "undersøg");
// stopword
assertAnalyzesTo(a, "", new String[] {});
}
@ -45,8 +45,8 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false);
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
checkOneTermReuse(a, "undersøg", "undersøg");
checkOneTerm(a, "undersøgelse", "undersøgelse");
checkOneTerm(a, "undersøg", "undersøg");
}
/** blast some random strings through the analyzer */

View File

@ -29,9 +29,9 @@ import org.apache.lucene.analysis.util.CharArraySet;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
Analyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
checkOneTermReuse(a, "Tisch", "tisch");
checkOneTermReuse(a, "Tische", "tisch");
checkOneTermReuse(a, "Tischen", "tisch");
checkOneTerm(a, "Tisch", "tisch");
checkOneTerm(a, "Tische", "tisch");
checkOneTerm(a, "Tischen", "tisch");
}
public void testWithKeywordAttribute() throws IOException {
@ -46,7 +46,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testStemExclusionTable() throws Exception {
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false));
checkOneTermReuse(a, "tischen", "tischen");
checkOneTerm(a, "tischen", "tischen");
}
/** test some features of the new snowball filter
@ -55,8 +55,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testGermanSpecials() throws Exception {
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
// a/o/u + e is equivalent to the umlaut form
checkOneTermReuse(a, "Schaltflächen", "schaltflach");
checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
checkOneTerm(a, "Schaltflächen", "schaltflach");
checkOneTerm(a, "Schaltflaechen", "schaltflach");
}
/** blast some random strings through the analyzer */

View File

@ -75,6 +75,6 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GermanLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -87,6 +87,6 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GermanMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -75,6 +75,6 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GermanNormalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -88,6 +88,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GermanStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -51,16 +51,16 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
// Verify the correct analysis of capitals and small accented letters, and
// stemming
assertAnalyzesToReuse(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
"ελληνικ", "γλωσσ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesToReuse(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
}

View File

@ -536,6 +536,6 @@ public class TestGreekStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GreekStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,14 +34,14 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "books", "book");
checkOneTermReuse(a, "book", "book");
checkOneTerm(a, "books", "book");
checkOneTerm(a, "book", "book");
// stopword
assertAnalyzesTo(a, "the", new String[] {});
// possessive removal
checkOneTermReuse(a, "steven's", "steven");
checkOneTermReuse(a, "steven\u2019s", "steven");
checkOneTermReuse(a, "steven\uFF07s", "steven");
checkOneTerm(a, "steven's", "steven");
checkOneTerm(a, "steven\u2019s", "steven");
checkOneTerm(a, "steven\uFF07s", "steven");
}
/** test use of exclusion set */
@ -49,8 +49,8 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false);
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "books", "books");
checkOneTermReuse(a, "book", "book");
checkOneTerm(a, "books", "books");
checkOneTerm(a, "book", "book");
}
/** blast some random strings through the analyzer */

View File

@ -65,6 +65,6 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -62,7 +62,7 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
/****** requires original java kstem source code to create map

View File

@ -74,6 +74,6 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PorterStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "chicana", "chican");
checkOneTermReuse(a, "chicano", "chican");
checkOneTerm(a, "chicana", "chican");
checkOneTerm(a, "chicano", "chican");
// stopword
assertAnalyzesTo(a, "los", new String[] {});
}
@ -45,8 +45,8 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false);
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "chicana", "chican");
checkOneTermReuse(a, "chicano", "chicano");
checkOneTerm(a, "chicana", "chican");
checkOneTerm(a, "chicano", "chicano");
}
/** blast some random strings through the analyzer */

View File

@ -59,6 +59,6 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new SpanishLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "zaldi", "zaldi");
checkOneTermReuse(a, "zaldiak", "zaldi");
checkOneTerm(a, "zaldi", "zaldi");
checkOneTerm(a, "zaldiak", "zaldi");
// stopword
assertAnalyzesTo(a, "izan", new String[] { });
}
@ -45,8 +45,8 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false);
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "zaldiak", "zaldiak");
checkOneTermReuse(a, "mendiari", "mendi");
checkOneTerm(a, "zaldiak", "zaldiak");
checkOneTerm(a, "mendiari", "mendi");
}
/** blast some random strings through the analyzer */

View File

@ -208,8 +208,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
*/
public void testReusableTokenStream() throws Exception {
Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
assertAnalyzesToReuse(a, "برگ‌ها", new String[] { "برگ" });
assertAnalyzesTo(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
assertAnalyzesTo(a, "برگ‌ها", new String[] { "برگ" });
}
/**

View File

@ -72,7 +72,7 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PersianNormalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
checkOneTermReuse(a, "edeltäjistään", "edeltäj");
checkOneTerm(a, "edeltäjiinsä", "edeltäj");
checkOneTerm(a, "edeltäjistään", "edeltäj");
// stopword
assertAnalyzesTo(a, "olla", new String[] {});
}
@ -45,8 +45,8 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
checkOneTerm(a, "edeltäjiinsä", "edeltäj");
checkOneTerm(a, "edeltäjistään", "edeltäjistään");
}
/** blast some random strings through the analyzer */

View File

@ -75,6 +75,6 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new FinnishLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -117,13 +117,13 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// stopwords
assertAnalyzesToReuse(
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesToReuse(
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
@ -140,7 +140,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
set.add("habitable");
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
CharArraySet.EMPTY_SET, set);
assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
"chist" });
fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
@ -169,7 +169,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
/** test accent-insensitive */
public void testAccentInsensitive() throws Exception {
Analyzer a = new FrenchAnalyzer(TEST_VERSION_CURRENT);
checkOneTermReuse(a, "sécuritaires", "securitair");
checkOneTermReuse(a, "securitaires", "securitair");
checkOneTerm(a, "sécuritaires", "securitair");
checkOneTerm(a, "securitaires", "securitair");
}
}

View File

@ -205,6 +205,6 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new FrenchLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -89,6 +89,6 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new FrenchMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "siopadóireacht", "siopadóir");
checkOneTermReuse(a, "síceapatacha", "síceapaite");
checkOneTerm(a, "siopadóireacht", "siopadóir");
checkOneTerm(a, "síceapatacha", "síceapaite");
// stopword
assertAnalyzesTo(a, "le", new String[] { });
}
@ -52,8 +52,8 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("feirmeoireacht"), false);
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT,
IrishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "feirmeoireacht", "feirmeoireacht");
checkOneTermReuse(a, "siopadóireacht", "siopadóir");
checkOneTerm(a, "feirmeoireacht", "feirmeoireacht");
checkOneTerm(a, "siopadóireacht", "siopadóir");
}
/** test special hyphen handling */

View File

@ -52,6 +52,6 @@ public class TestIrishLowerCaseFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new IrishLowerCaseFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "correspondente", "correspond");
checkOneTermReuse(a, "corresponderá", "correspond");
checkOneTerm(a, "correspondente", "correspond");
checkOneTerm(a, "corresponderá", "correspond");
// stopword
assertAnalyzesTo(a, "e", new String[] {});
}
@ -45,8 +45,8 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false);
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "correspondente", "correspondente");
checkOneTermReuse(a, "corresponderá", "correspond");
checkOneTerm(a, "correspondente", "correspondente");
checkOneTerm(a, "corresponderá", "correspond");
}
/** blast some random strings through the analyzer */

View File

@ -79,6 +79,6 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -58,6 +58,6 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new GalicianStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,15 +34,15 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws Exception {
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT);
// two ways to write 'hindi' itself.
checkOneTermReuse(a, "हिन्दी", "हिंद");
checkOneTermReuse(a, "हिंदी", "हिंद");
checkOneTerm(a, "हिन्दी", "हिंद");
checkOneTerm(a, "हिंदी", "हिंद");
}
public void testExclusionSet() throws Exception {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false);
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "हिंदी", "हिंदी");
checkOneTerm(a, "हिंदी", "हिंदी");
}
/** blast some random strings through the analyzer */

View File

@ -75,6 +75,6 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new HindiNormalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -97,6 +97,6 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new HindiStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "babakocsi", "babakocs");
checkOneTermReuse(a, "babakocsijáért", "babakocs");
checkOneTerm(a, "babakocsi", "babakocs");
checkOneTerm(a, "babakocsijáért", "babakocs");
// stopword
assertAnalyzesTo(a, "által", new String[] {});
}
@ -45,8 +45,8 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "babakocsi", "babakocsi");
checkOneTermReuse(a, "babakocsijáért", "babakocs");
checkOneTerm(a, "babakocsi", "babakocsi");
checkOneTerm(a, "babakocsijáért", "babakocs");
}
/** blast some random strings through the analyzer */

View File

@ -70,6 +70,6 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new HungarianLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -89,6 +89,6 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "արծիվ", "արծ");
checkOneTermReuse(a, "արծիվներ", "արծ");
checkOneTerm(a, "արծիվ", "արծ");
checkOneTerm(a, "արծիվներ", "արծ");
// stopword
assertAnalyzesTo(a, "է", new String[] { });
}
@ -45,8 +45,8 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false);
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "արծիվներ", "արծիվներ");
checkOneTermReuse(a, "արծիվ", "արծ");
checkOneTerm(a, "արծիվներ", "արծիվներ");
checkOneTerm(a, "արծիվ", "արծ");
}
/** blast some random strings through the analyzer */

View File

@ -34,8 +34,8 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "peledakan", "ledak");
checkOneTermReuse(a, "pembunuhan", "bunuh");
checkOneTerm(a, "peledakan", "ledak");
checkOneTerm(a, "pembunuhan", "bunuh");
// stopword
assertAnalyzesTo(a, "bahwa", new String[] {});
}
@ -45,8 +45,8 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false);
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "peledakan", "peledakan");
checkOneTermReuse(a, "pembunuhan", "bunuh");
checkOneTerm(a, "peledakan", "peledakan");
checkOneTerm(a, "pembunuhan", "bunuh");
}
/** blast some random strings through the analyzer */

View File

@ -41,73 +41,73 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
/** Some examples from the paper */
public void testExamples() throws IOException {
checkOneTerm(a, "bukukah", "buku");
checkOneTermReuse(a, "adalah", "ada");
checkOneTermReuse(a, "bukupun", "buku");
checkOneTermReuse(a, "bukuku", "buku");
checkOneTermReuse(a, "bukumu", "buku");
checkOneTermReuse(a, "bukunya", "buku");
checkOneTermReuse(a, "mengukur", "ukur");
checkOneTermReuse(a, "menyapu", "sapu");
checkOneTermReuse(a, "menduga", "duga");
checkOneTermReuse(a, "menuduh", "uduh");
checkOneTermReuse(a, "membaca", "baca");
checkOneTermReuse(a, "merusak", "rusak");
checkOneTermReuse(a, "pengukur", "ukur");
checkOneTermReuse(a, "penyapu", "sapu");
checkOneTermReuse(a, "penduga", "duga");
checkOneTermReuse(a, "pembaca", "baca");
checkOneTermReuse(a, "diukur", "ukur");
checkOneTermReuse(a, "tersapu", "sapu");
checkOneTermReuse(a, "kekasih", "kasih");
checkOneTermReuse(a, "berlari", "lari");
checkOneTermReuse(a, "belajar", "ajar");
checkOneTermReuse(a, "bekerja", "kerja");
checkOneTermReuse(a, "perjelas", "jelas");
checkOneTermReuse(a, "pelajar", "ajar");
checkOneTermReuse(a, "pekerja", "kerja");
checkOneTermReuse(a, "tarikkan", "tarik");
checkOneTermReuse(a, "ambilkan", "ambil");
checkOneTermReuse(a, "mengambilkan", "ambil");
checkOneTermReuse(a, "makanan", "makan");
checkOneTermReuse(a, "janjian", "janji");
checkOneTermReuse(a, "perjanjian", "janji");
checkOneTermReuse(a, "tandai", "tanda");
checkOneTermReuse(a, "dapati", "dapat");
checkOneTermReuse(a, "mendapati", "dapat");
checkOneTermReuse(a, "pantai", "panta");
checkOneTerm(a, "adalah", "ada");
checkOneTerm(a, "bukupun", "buku");
checkOneTerm(a, "bukuku", "buku");
checkOneTerm(a, "bukumu", "buku");
checkOneTerm(a, "bukunya", "buku");
checkOneTerm(a, "mengukur", "ukur");
checkOneTerm(a, "menyapu", "sapu");
checkOneTerm(a, "menduga", "duga");
checkOneTerm(a, "menuduh", "uduh");
checkOneTerm(a, "membaca", "baca");
checkOneTerm(a, "merusak", "rusak");
checkOneTerm(a, "pengukur", "ukur");
checkOneTerm(a, "penyapu", "sapu");
checkOneTerm(a, "penduga", "duga");
checkOneTerm(a, "pembaca", "baca");
checkOneTerm(a, "diukur", "ukur");
checkOneTerm(a, "tersapu", "sapu");
checkOneTerm(a, "kekasih", "kasih");
checkOneTerm(a, "berlari", "lari");
checkOneTerm(a, "belajar", "ajar");
checkOneTerm(a, "bekerja", "kerja");
checkOneTerm(a, "perjelas", "jelas");
checkOneTerm(a, "pelajar", "ajar");
checkOneTerm(a, "pekerja", "kerja");
checkOneTerm(a, "tarikkan", "tarik");
checkOneTerm(a, "ambilkan", "ambil");
checkOneTerm(a, "mengambilkan", "ambil");
checkOneTerm(a, "makanan", "makan");
checkOneTerm(a, "janjian", "janji");
checkOneTerm(a, "perjanjian", "janji");
checkOneTerm(a, "tandai", "tanda");
checkOneTerm(a, "dapati", "dapat");
checkOneTerm(a, "mendapati", "dapat");
checkOneTerm(a, "pantai", "panta");
}
/** Some detailed analysis examples (that might not be the best) */
public void testIRExamples() throws IOException {
checkOneTerm(a, "penyalahgunaan", "salahguna");
checkOneTermReuse(a, "menyalahgunakan", "salahguna");
checkOneTermReuse(a, "disalahgunakan", "salahguna");
checkOneTerm(a, "menyalahgunakan", "salahguna");
checkOneTerm(a, "disalahgunakan", "salahguna");
checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");
checkOneTerm(a, "pertanggungjawaban", "tanggungjawab");
checkOneTerm(a, "mempertanggungjawabkan", "tanggungjawab");
checkOneTerm(a, "dipertanggungjawabkan", "tanggungjawab");
checkOneTermReuse(a, "pelaksanaan", "laksana");
checkOneTermReuse(a, "pelaksana", "laksana");
checkOneTermReuse(a, "melaksanakan", "laksana");
checkOneTermReuse(a, "dilaksanakan", "laksana");
checkOneTerm(a, "pelaksanaan", "laksana");
checkOneTerm(a, "pelaksana", "laksana");
checkOneTerm(a, "melaksanakan", "laksana");
checkOneTerm(a, "dilaksanakan", "laksana");
checkOneTermReuse(a, "melibatkan", "libat");
checkOneTermReuse(a, "terlibat", "libat");
checkOneTerm(a, "melibatkan", "libat");
checkOneTerm(a, "terlibat", "libat");
checkOneTermReuse(a, "penculikan", "culik");
checkOneTermReuse(a, "menculik", "culik");
checkOneTermReuse(a, "diculik", "culik");
checkOneTermReuse(a, "penculik", "culik");
checkOneTerm(a, "penculikan", "culik");
checkOneTerm(a, "menculik", "culik");
checkOneTerm(a, "diculik", "culik");
checkOneTerm(a, "penculik", "culik");
checkOneTermReuse(a, "perubahan", "ubah");
checkOneTermReuse(a, "peledakan", "ledak");
checkOneTermReuse(a, "penanganan", "tangan");
checkOneTermReuse(a, "kepolisian", "polisi");
checkOneTermReuse(a, "kenaikan", "naik");
checkOneTermReuse(a, "bersenjata", "senjata");
checkOneTermReuse(a, "penyelewengan", "seleweng");
checkOneTermReuse(a, "kecelakaan", "celaka");
checkOneTerm(a, "perubahan", "ubah");
checkOneTerm(a, "peledakan", "ledak");
checkOneTerm(a, "penanganan", "tangan");
checkOneTerm(a, "kepolisian", "polisi");
checkOneTerm(a, "kenaikan", "naik");
checkOneTerm(a, "bersenjata", "senjata");
checkOneTerm(a, "penyelewengan", "seleweng");
checkOneTerm(a, "kecelakaan", "celaka");
}
/* inflectional-only stemming */
@ -122,15 +122,15 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
/** Test stemming only inflectional suffixes */
public void testInflectionalOnly() throws IOException {
checkOneTerm(b, "bukunya", "buku");
checkOneTermReuse(b, "bukukah", "buku");
checkOneTermReuse(b, "bukunyakah", "buku");
checkOneTermReuse(b, "dibukukannya", "dibukukan");
checkOneTerm(b, "bukukah", "buku");
checkOneTerm(b, "bukunyakah", "buku");
checkOneTerm(b, "dibukukannya", "dibukukan");
}
public void testShouldntStem() throws IOException {
checkOneTerm(a, "bersenjata", "senjata");
checkOneTermReuse(a, "bukukah", "buku");
checkOneTermReuse(a, "gigi", "gigi");
checkOneTerm(a, "bukukah", "buku");
checkOneTerm(a, "gigi", "gigi");
}
public void testEmptyTerm() throws IOException {
@ -141,6 +141,6 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -60,6 +60,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new IndicNormalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -37,8 +37,8 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "abbandonata", "abbandonat");
checkOneTermReuse(a, "abbandonati", "abbandonat");
checkOneTerm(a, "abbandonata", "abbandonat");
checkOneTerm(a, "abbandonati", "abbandonat");
// stopword
assertAnalyzesTo(a, "dallo", new String[] {});
}
@ -48,8 +48,8 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false);
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "abbandonata", "abbandonata");
checkOneTermReuse(a, "abbandonati", "abbandonat");
checkOneTerm(a, "abbandonata", "abbandonata");
checkOneTerm(a, "abbandonati", "abbandonat");
}
/** blast some random strings through the analyzer */

View File

@ -59,6 +59,6 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ItalianLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "tirgiem", "tirg");
checkOneTermReuse(a, "tirgus", "tirg");
checkOneTerm(a, "tirgiem", "tirg");
checkOneTerm(a, "tirgus", "tirg");
// stopword
assertAnalyzesTo(a, "un", new String[] {});
}
@ -45,8 +45,8 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false);
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "tirgiem", "tirgiem");
checkOneTermReuse(a, "tirgus", "tirg");
checkOneTerm(a, "tirgiem", "tirgiem");
checkOneTerm(a, "tirgus", "tirg");
}
/** blast some random strings through the analyzer */

View File

@ -278,6 +278,6 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -1934,6 +1934,6 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -143,6 +143,6 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -85,6 +85,6 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -48,7 +48,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -175,7 +175,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -115,6 +115,6 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -197,7 +197,6 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
tk.reset();
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
new int[] { 6,11,11,14 },

View File

@ -115,24 +115,24 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
public void testSnowballCorrectness() throws Exception {
Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
checkOneTermReuse(a, "opheffen", "opheff");
checkOneTermReuse(a, "opheffende", "opheff");
checkOneTermReuse(a, "opheffing", "opheff");
checkOneTerm(a, "opheffen", "opheff");
checkOneTerm(a, "opheffende", "opheff");
checkOneTerm(a, "opheffing", "opheff");
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
checkOneTermReuse(a, "lichamelijk", "licham");
checkOneTermReuse(a, "lichamelijke", "licham");
checkOneTermReuse(a, "lichamelijkheden", "licham");
checkOneTerm(a, "lichaamsziek", "lichaamsziek");
checkOneTerm(a, "lichamelijk", "licham");
checkOneTerm(a, "lichamelijke", "licham");
checkOneTerm(a, "lichamelijkheden", "licham");
}
public void testExclusionTableViaCtor() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("lichamelijk");
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });

View File

@ -34,8 +34,8 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
checkOneTerm(a, "havnedistriktene", "havnedistrikt");
checkOneTerm(a, "havnedistrikter", "havnedistrikt");
// stopword
assertAnalyzesTo(a, "det", new String[] {});
}
@ -45,8 +45,8 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false);
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
checkOneTerm(a, "havnedistriktene", "havnedistriktene");
checkOneTerm(a, "havnedistrikter", "havnedistrikt");
}
/** blast some random strings through the analyzer */

View File

@ -93,6 +93,6 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new NorwegianLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -92,6 +92,6 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -114,7 +114,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,6 +34,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
filter.reset();
assertTermEquals("The", filter, termAtt, payAtt, null);
assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
@ -45,6 +46,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
}
public void testNext() throws Exception {
@ -53,6 +56,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
filter.reset();
assertTermEquals("The", filter, null);
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("red", filter, "JJ".getBytes("UTF-8"));
@ -64,6 +68,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
assertTermEquals("brown", filter, "JJ".getBytes("UTF-8"));
assertTermEquals("dogs", filter, "NN".getBytes("UTF-8"));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
}
@ -72,6 +78,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new FloatEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
filter.reset();
assertTermEquals("The", filter, termAtt, payAtt, null);
assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeFloat(1.0f));
assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeFloat(2.0f));
@ -83,6 +90,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeFloat(99.3f));
assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeFloat(83.7f));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
}
public void testIntEncoding() throws Exception {
@ -90,6 +99,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new IntegerEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
filter.reset();
assertTermEquals("The", filter, termAtt, payAtt, null);
assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeInt(1));
assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeInt(2));
@ -101,12 +111,13 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeInt(99));
assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeInt(83));
assertFalse(filter.incrementToken());
filter.end();
filter.close();
}
void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
BytesRef payload = payloadAtt.getPayload();
@ -123,7 +134,6 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
BytesRef payload = payAtt.getPayload();

View File

@ -34,8 +34,8 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "quilométricas", "quilometric");
checkOneTermReuse(a, "quilométricos", "quilometric");
checkOneTerm(a, "quilométricas", "quilometric");
checkOneTerm(a, "quilométricos", "quilometric");
// stopword
assertAnalyzesTo(a, "não", new String[] {});
}
@ -45,8 +45,8 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "quilométricas", "quilométricas");
checkOneTermReuse(a, "quilométricos", "quilometric");
checkOneTerm(a, "quilométricas", "quilométricas");
checkOneTerm(a, "quilométricos", "quilometric");
}
/** blast some random strings through the analyzer */

View File

@ -123,6 +123,6 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PortugueseLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -97,6 +97,6 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PortugueseMinimalStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -96,6 +96,6 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -113,6 +113,6 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "absenţa", "absenţ");
checkOneTermReuse(a, "absenţi", "absenţ");
checkOneTerm(a, "absenţa", "absenţ");
checkOneTerm(a, "absenţi", "absenţ");
// stopword
assertAnalyzesTo(a, "îl", new String[] {});
}
@ -45,8 +45,8 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false);
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "absenţa", "absenţa");
checkOneTermReuse(a, "absenţi", "absenţ");
checkOneTerm(a, "absenţa", "absenţa");
checkOneTerm(a, "absenţi", "absenţ");
}
/** blast some random strings through the analyzer */

View File

@ -39,9 +39,9 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
assertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
assertAnalyzesTo(a, "Но знание это хранилось в тайне",
new String[] { "знан", "эт", "хран", "тайн" });
}
@ -50,7 +50,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("представление");
Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
assertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
}

View File

@ -75,6 +75,6 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new RussianLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -140,12 +140,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
assertAnalyzesToReuse(a, "please divide into shingles",
assertAnalyzesTo(a, "please divide into shingles",
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
assertAnalyzesToReuse(a, "divide me up again",
assertAnalyzesTo(a, "divide me up again",
new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
new int[] { 0, 0, 7, 7, 10, 10, 13 },
new int[] { 6, 9, 9, 12, 12, 18, 18 },
@ -155,7 +155,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this", "please divide this sentence",
"divide", "divide this sentence", "divide this sentence into",
"this", "this sentence into", "this sentence into shingles",
@ -168,7 +168,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
"this sentence into", "this sentence into shingles",
@ -181,7 +181,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this",
"divide", "divide this sentence",
"this", "this sentence into",
@ -194,7 +194,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
"this sentence into",
@ -210,7 +210,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
"into", "intoshingles",
@ -224,7 +224,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
"intoshingles" },
@ -239,7 +239,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
null, true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
"into", "intoshingles",
@ -253,7 +253,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
"intoshingles" },
@ -267,7 +267,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into",
"into", "into<SEP>shingles",
@ -281,7 +281,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide",
"divide<SEP>into",
"into<SEP>shingles" },
@ -296,7 +296,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, true);
assertAnalyzesToReuse(analyzer, "please",
assertAnalyzesTo(analyzer, "please",
new String[] { "please" },
new int[] { 0 },
new int[] { 6 },

View File

@ -1134,7 +1134,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
public void testTrailingHole1() throws IOException {

View File

@ -114,7 +114,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
checkOneTerm(a, "jaktkarlarne", "jaktkarl");
checkOneTerm(a, "jaktkarlens", "jaktkarl");
// stopword
assertAnalyzesTo(a, "och", new String[] {});
}
@ -45,8 +45,8 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false);
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
checkOneTerm(a, "jaktkarlarne", "jaktkarlarne");
checkOneTerm(a, "jaktkarlens", "jaktkarl");
}
/** blast some random strings through the analyzer */

View File

@ -75,6 +75,6 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new SwedishLightStemFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -92,14 +92,14 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
public void testReusableTokenStream() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesTo(analyzer, "", new String[] {});
assertAnalyzesToReuse(
assertAnalyzesTo(
analyzer,
"การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
assertAnalyzesToReuse(
assertAnalyzesTo(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
@ -136,6 +136,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ThaiWordFilter(TEST_VERSION_CURRENT, tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -34,8 +34,8 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "ağacı", "ağaç");
checkOneTermReuse(a, "ağaç", "ağaç");
checkOneTerm(a, "ağacı", "ağaç");
checkOneTerm(a, "ağaç", "ağaç");
// stopword
assertAnalyzesTo(a, "dolayı", new String[] {});
}
@ -45,8 +45,8 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false);
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "ağacı", "ağacı");
checkOneTermReuse(a, "ağaç", "ağaç");
checkOneTerm(a, "ağacı", "ağacı");
checkOneTerm(a, "ağaç", "ağaç");
}
/** blast some random strings through the analyzer */

View File

@ -83,6 +83,6 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new TurkishLowerCaseFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -69,7 +69,7 @@ public class TestElision extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -45,8 +45,7 @@ public final class ICUTokenizer extends Tokenizer {
/** true length of text in the buffer */
private int length = 0;
/** length in buffer that can be evaluated safely, up to a safe end point */
// note: usableLength is -1 here to best-effort AIOOBE consumers that don't call reset()
private int usableLength = -1;
private int usableLength = 0;
/** accumulated offset of previous buffers for this reader, for offsetAtt */
private int offset = 0;

View File

@ -87,6 +87,6 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -87,6 +87,6 @@ public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -109,6 +109,6 @@ public class TestICUTransformFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

View File

@ -207,7 +207,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
}
public void testReusableTokenStream() throws Exception {
assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང",
"འཕེལ", "དུ", "གཏོང", "བར", "", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
}

View File

@ -191,14 +191,14 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
}
public void testReusableTokenStream() throws IOException {
assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "", "c", "かき", "きく", "くけ", "" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },

View File

@ -243,7 +243,7 @@ public final class JapaneseTokenizer extends Tokenizer {
outputCompounds = false;
break;
}
buffer.reset(null); // best effort NPE consumers that don't call reset()
buffer.reset(this.input);
resetState();
@ -260,8 +260,15 @@ public final class JapaneseTokenizer extends Tokenizer {
this.dotOut = dotOut;
}
/**
 * Closes this tokenizer via the superclass and then re-attaches the internal
 * {@code buffer} to whatever {@code input} refers to afterwards.
 *
 * @throws IOException if {@code super.close()} or the buffer reset fails
 */
@Override
public void close() throws IOException {
super.close();
// NOTE(review): presumably super.close() swaps `input` for a placeholder reader,
// and this keeps `buffer` from holding on to the reader that was just closed --
// confirm against Tokenizer.close().
buffer.reset(input);
}
/**
 * Prepares this tokenizer for consuming a stream: resets the superclass,
 * points the internal {@code buffer} at the current {@code input}, and then
 * clears per-stream state through {@code resetState()}.
 *
 * @throws IOException if {@code super.reset()} or the buffer reset fails
 */
@Override
public void reset() throws IOException {
// Must run first: as of LUCENE-5235, Tokenizer subclasses have to call
// super.reset(), otherwise the Reader is not correctly assigned and the
// consumer gets an IllegalStateException.
super.reset();
// Attach the buffer only after super.reset(), so it sees the reader assigned above.
buffer.reset(input);
resetState();
}

View File

@ -75,6 +75,6 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
}
};
checkOneTermReuse(a, "", "");
checkOneTerm(a, "", "");
}
}

Some files were not shown because too many files have changed in this diff Show More