mirror of https://github.com/apache/lucene.git
LUCENE-5235: Tokenizers now throw an IllegalStateException if the consumer does not call reset() before consuming the stream. Previous versions threw NullPointerException or ArrayIndexOutOfBoundsException on a best-effort basis, which was not user-friendly.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1525362 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent adba0da045
commit 34adebab3b

@@ -74,6 +74,12 @@ New Features
 * LUCENE-5219: Add support to SynonymFilterFactory for custom
   parsers. (Ryan Ernst via Robert Muir)
 
+* LUCENE-5235: Tokenizers now throw an IllegalStateException if the
+  consumer does not call reset() before consuming the stream. Previous
+  versions threw NullPointerException or ArrayIndexOutOfBoundsException
+  on a best-effort basis, which was not user-friendly.
+  (Uwe Schindler, Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead

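The entry above makes the long-documented consumer workflow mandatory. A minimal sketch of that workflow, using only stock 4.x API (the field name and analyzer are placeholders); with this change, skipping reset() fails fast with an IllegalStateException instead of an obscure NullPointerException deep inside the tokenizer:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeExample {
  // The reset/incrementToken/end/close cycle every consumer must follow.
  static void consumeAll(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
      ts.reset();                 // now enforced: must precede incrementToken()
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
      }
      ts.end();                   // records end-of-stream state (final offset)
    } finally {
      ts.close();                 // releases the Reader
    }
  }
}

The test changes at the bottom of this commit (DelimitedPayloadTokenFilterTest) add exactly these reset()/end()/close() calls.
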
@@ -94,6 +100,11 @@ Documentation
 
 Changes in backwards compatibility policy
 
+* LUCENE-5235: Subclasses of Tokenizer have to call super.reset()
+  when implementing reset(). Otherwise the consumer will get an
+  IllegalStateException because the Reader is not correctly assigned.
+  (Uwe Schindler, Robert Muir)
+
 * LUCENE-5204: Directory doesn't have default implementations for
   LockFactory-related methods, which have been moved to BaseDirectory. If you
   had a custom Directory implementation that extended Directory, you need to

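For implementors, the backwards-compatibility entry boils down to one rule: super.reset() must be the first statement of an overridden reset(). A sketch with a hypothetical subclass (MyTokenizer and its pos field are illustrative, not part of Lucene); in this release it is super.reset() that makes the pending Reader available through the protected input field, so touching input before that call now raises the IllegalStateException:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;

// Hypothetical tokenizer showing the new subclass contract.
public final class MyTokenizer extends Tokenizer {
  private int pos; // example per-stream state

  public MyTokenizer(Reader input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    // ... read from 'input' here and populate attributes ...
    return false; // placeholder; a real implementation returns true per token
  }

  @Override
  public void reset() throws IOException {
    super.reset(); // first: wires up 'input' for the new stream
    pos = 0;       // then clear per-stream state
  }
}
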
@@ -88,6 +88,7 @@ public final class KeywordTokenizer extends Tokenizer {
 
   @Override
   public void reset() throws IOException {
+    super.reset();
     this.done = false;
   }
 }

@@ -140,7 +140,8 @@ public final class Lucene43NGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public void end() {
+  public void end() throws IOException {
+    super.end();
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);

@@ -138,6 +138,7 @@ public final class PatternTokenizer extends Tokenizer {
 
   @Override
   public void reset() throws IOException {
+    super.reset();
     fillBuffer(str, input);
     matcher.reset(str);
     index = 0;

@@ -114,7 +114,7 @@ public final class ClassicTokenizer extends Tokenizer {
   }
 
   private void init(Version matchVersion) {
-    this.scanner = new ClassicTokenizerImpl(null); // best effort NPE if you dont call reset
+    this.scanner = new ClassicTokenizerImpl(input);
   }
 
   // this tokenizer generates three attributes:

@@ -171,8 +171,15 @@ public final class ClassicTokenizer extends Tokenizer {
     posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
   }
 
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
   @Override
   public void reset() throws IOException {
+    super.reset();
     scanner.yyreset(input);
     skippedPositions = 0;
   }

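The constructor change above (building the JFlex scanner over input instead of null) is safe because the base Tokenizer now guarantees that input is never null: until reset() runs it points at a sentinel Reader whose read() throws the new exception. A rough sketch of that mechanism; the field names and message below are assumptions about the base-class internals, not verbatim Lucene source:

import java.io.IOException;
import java.io.Reader;

// Sketch only: how a base class can hand out a fail-fast 'input'.
abstract class TokenizerSketch {
  private static final Reader ILLEGAL_STATE_READER = new Reader() {
    @Override
    public int read(char[] cbuf, int off, int len) {
      throw new IllegalStateException("TokenStream contract violation: "
          + "reset()/close() call missing, reset() called multiple times, "
          + "or subclass does not call super.reset()");
    }
    @Override
    public void close() {}
  };

  protected Reader input = ILLEGAL_STATE_READER;       // what the scanners wrap
  private Reader inputPending = ILLEGAL_STATE_READER;  // set by setReader()

  public void reset() throws IOException {
    input = inputPending;                // swap the real Reader in
    inputPending = ILLEGAL_STATE_READER; // a second reset() would fail fast
  }
}

This also explains the new close() override: after super.close() releases the Reader, scanner.yyreset(input) re-points the JFlex buffer at whatever input now holds (in this sketch, the sentinel again), so a closed or never-reset ClassicTokenizer, and likewise the StandardTokenizer, UAX29URLEmailTokenizer and WikipediaTokenizer below, fails with the descriptive exception instead of reading a stale Reader.
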
@@ -128,7 +128,7 @@ public final class StandardTokenizer extends Tokenizer {
   }
 
   private final void init(Version matchVersion) {
-    this.scanner = new StandardTokenizerImpl(null); // best effort NPE if you dont call reset
+    this.scanner = new StandardTokenizerImpl(input);
   }
 
   // this tokenizer generates three attributes:

@@ -179,8 +179,15 @@ public final class StandardTokenizer extends Tokenizer {
     posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
   }
 
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
   @Override
   public void reset() throws IOException {
+    super.reset();
     scanner.yyreset(input);
     skippedPositions = 0;
   }

@@ -111,8 +111,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     this.scanner = getScannerFor(matchVersion);
   }
 
-  private static StandardTokenizerInterface getScannerFor(Version matchVersion) {
-    return new UAX29URLEmailTokenizerImpl(null); // best effort NPE if you dont call reset
+  private StandardTokenizerInterface getScannerFor(Version matchVersion) {
+    return new UAX29URLEmailTokenizerImpl(input);
   }
 
   // this tokenizer generates three attributes:

@@ -158,8 +158,15 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
   }
 
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
   @Override
   public void reset() throws IOException {
+    super.reset();
     scanner.yyreset(input);
     skippedPositions = 0;
   }

@@ -62,8 +62,7 @@ public abstract class CharTokenizer extends Tokenizer {
     charUtils = CharacterUtils.getInstance(matchVersion);
   }
 
-  // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
-  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
+  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
 

@@ -150,6 +149,7 @@ public abstract class CharTokenizer extends Tokenizer {
 
   @Override
   public void reset() throws IOException {
+    super.reset();
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;

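The CharTokenizer hunk above is the mirror image of the scanner changes: the bufferIndex = -1 sentinel (per its removed comment) existed only to provoke a best-effort ArrayIndexOutOfBoundsException in consumers that skipped reset(). With the contract enforced by the base class and super.reset() added to reset(), the sentinel is unnecessary and the field can start at 0 like the other counters.
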
@@ -143,7 +143,7 @@ public final class WikipediaTokenizer extends Tokenizer {
    */
   public WikipediaTokenizer(Reader input, int tokenOutput, Set<String> untokenizedTypes) {
     super(input);
-    this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
+    this.scanner = new WikipediaTokenizerImpl(this.input);
     init(tokenOutput, untokenizedTypes);
   }
 

@@ -156,7 +156,7 @@ public final class WikipediaTokenizer extends Tokenizer {
    */
   public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
     super(factory, input);
-    this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset
+    this.scanner = new WikipediaTokenizerImpl(this.input);
     init(tokenOutput, untokenizedTypes);
   }
 

@@ -295,6 +295,12 @@ public final class WikipediaTokenizer extends Tokenizer {
     offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
   }
 
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
   /*
    * (non-Javadoc)
    *

@@ -302,6 +308,7 @@ public final class WikipediaTokenizer extends Tokenizer {
    */
   @Override
   public void reset() throws IOException {
+    super.reset();
     scanner.yyreset(input);
     tokens = null;
     scanner.reset();

@@ -60,8 +60,8 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
    */
   public void testReusableTokenStream() throws Exception {
     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
-    assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
+    assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
+    assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
   }
 
   /**

@@ -86,12 +86,12 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false);
     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
-    assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+    assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
 
 
     a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
     assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
-    assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+    assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
   }
 
   /** blast some random strings through the analyzer */

@@ -102,7 +102,7 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -142,6 +142,6 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -49,8 +49,8 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws IOException {
     Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
-    assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
+    assertAnalyzesTo(a, "документи", new String[] {"документ"});
+    assertAnalyzesTo(a, "документ", new String[] {"документ"});
   }
 
   /**

@@ -234,6 +234,6 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new BulgarianStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -157,7 +157,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
   }
 
   private void checkReuse(Analyzer a, String input, String expected) throws Exception {
-    checkOneTermReuse(a, input, expected);
+    checkOneTerm(a, input, expected);
   }
 
   /** blast some random strings through the analyzer */

@@ -173,6 +173,6 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "llengües", "llengu");
-    checkOneTermReuse(a, "llengua", "llengu");
+    checkOneTerm(a, "llengües", "llengu");
+    checkOneTerm(a, "llengua", "llengu");
     // stopword
     assertAnalyzesTo(a, "un", new String[] { });
   }

@@ -52,8 +52,8 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false);
     Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
         CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "llengües", "llengües");
-    checkOneTermReuse(a, "llengua", "llengu");
+    checkOneTerm(a, "llengües", "llengües");
+    checkOneTerm(a, "llengua", "llengu");
   }
 
   /** blast some random strings through the analyzer */

@@ -167,14 +167,14 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws IOException {
-    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
+    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
         new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
         new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
         new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
         new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
         new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
 
-    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
+    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
         new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
         new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
         new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },

@@ -288,6 +288,6 @@ public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -74,6 +74,6 @@ public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new CJKWidthFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -377,7 +377,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
 
     InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
     final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);

@@ -390,6 +390,6 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, filter);
       }
     };
-    checkOneTermReuse(b, "", "");
+    checkOneTerm(b, "", "");
   }
 }

@@ -39,8 +39,8 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     Analyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
-    assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
+    assertAnalyzesTo(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
+    assertAnalyzesTo(analyzer, "Česká Republika", new String[] { "česk", "republik" });
   }
 
   public void testWithStemExclusionSet() throws IOException{

@@ -294,7 +294,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -34,8 +34,8 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "undersøg", "undersøg");
-    checkOneTermReuse(a, "undersøgelse", "undersøg");
+    checkOneTerm(a, "undersøg", "undersøg");
+    checkOneTerm(a, "undersøgelse", "undersøg");
     // stopword
     assertAnalyzesTo(a, "på", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false);
     Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
         DanishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "undersøgelse", "undersøgelse");
-    checkOneTermReuse(a, "undersøg", "undersøg");
+    checkOneTerm(a, "undersøgelse", "undersøgelse");
+    checkOneTerm(a, "undersøg", "undersøg");
   }
 
   /** blast some random strings through the analyzer */

@@ -29,9 +29,9 @@ import org.apache.lucene.analysis.util.CharArraySet;
 public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
-    checkOneTermReuse(a, "Tisch", "tisch");
-    checkOneTermReuse(a, "Tische", "tisch");
-    checkOneTermReuse(a, "Tischen", "tisch");
+    checkOneTerm(a, "Tisch", "tisch");
+    checkOneTerm(a, "Tische", "tisch");
+    checkOneTerm(a, "Tischen", "tisch");
   }
 
   public void testWithKeywordAttribute() throws IOException {

@@ -46,7 +46,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
   public void testStemExclusionTable() throws Exception {
     GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
         new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false));
-    checkOneTermReuse(a, "tischen", "tischen");
+    checkOneTerm(a, "tischen", "tischen");
   }
 
   /** test some features of the new snowball filter

@@ -55,8 +55,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
   public void testGermanSpecials() throws Exception {
     GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
     // a/o/u + e is equivalent to the umlaut form
-    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
-    checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
+    checkOneTerm(a, "Schaltflächen", "schaltflach");
+    checkOneTerm(a, "Schaltflaechen", "schaltflach");
   }
 
   /** blast some random strings through the analyzer */

@@ -75,6 +75,6 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GermanLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -87,6 +87,6 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GermanMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -75,6 +75,6 @@ public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GermanNormalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -88,6 +88,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GermanStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -51,16 +51,16 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
     Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
     // Verify the correct analysis of capitals and small accented letters, and
     // stemming
-    assertAnalyzesToReuse(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
+    assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
         new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
         "ελληνικ", "γλωσσ" });
     // Verify the correct analysis of small letters with diaeresis and the elimination
     // of punctuation marks
-    assertAnalyzesToReuse(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
+    assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
        new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
     // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
     // as well as the elimination of stop words
-    assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
+    assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
         new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
   }
 

@@ -536,6 +536,6 @@ public class TestGreekStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GreekStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,14 +34,14 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "books", "book");
-    checkOneTermReuse(a, "book", "book");
+    checkOneTerm(a, "books", "book");
+    checkOneTerm(a, "book", "book");
     // stopword
     assertAnalyzesTo(a, "the", new String[] {});
     // possessive removal
-    checkOneTermReuse(a, "steven's", "steven");
-    checkOneTermReuse(a, "steven\u2019s", "steven");
-    checkOneTermReuse(a, "steven\uFF07s", "steven");
+    checkOneTerm(a, "steven's", "steven");
+    checkOneTerm(a, "steven\u2019s", "steven");
+    checkOneTerm(a, "steven\uFF07s", "steven");
   }
 
   /** test use of exclusion set */

@@ -49,8 +49,8 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false);
     Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
         EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "books", "books");
-    checkOneTermReuse(a, "book", "book");
+    checkOneTerm(a, "books", "books");
+    checkOneTerm(a, "book", "book");
   }
 
   /** blast some random strings through the analyzer */

@@ -65,6 +65,6 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -62,7 +62,7 @@ public class TestKStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
   /****** requires original java kstem source code to create map

@@ -74,6 +74,6 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PorterStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "chicana", "chican");
-    checkOneTermReuse(a, "chicano", "chican");
+    checkOneTerm(a, "chicana", "chican");
+    checkOneTerm(a, "chicano", "chican");
     // stopword
     assertAnalyzesTo(a, "los", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false);
     Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
         SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "chicana", "chican");
-    checkOneTermReuse(a, "chicano", "chicano");
+    checkOneTerm(a, "chicana", "chican");
+    checkOneTerm(a, "chicano", "chicano");
   }
 
   /** blast some random strings through the analyzer */

@@ -59,6 +59,6 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new SpanishLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "zaldi", "zaldi");
-    checkOneTermReuse(a, "zaldiak", "zaldi");
+    checkOneTerm(a, "zaldi", "zaldi");
+    checkOneTerm(a, "zaldiak", "zaldi");
     // stopword
     assertAnalyzesTo(a, "izan", new String[] { });
   }

@@ -45,8 +45,8 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false);
     Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
         BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "zaldiak", "zaldiak");
-    checkOneTermReuse(a, "mendiari", "mendi");
+    checkOneTerm(a, "zaldiak", "zaldiak");
+    checkOneTerm(a, "mendiari", "mendi");
   }
 
   /** blast some random strings through the analyzer */

@@ -208,8 +208,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    */
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesToReuse(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
-    assertAnalyzesToReuse(a, "برگها", new String[] { "برگ" });
+    assertAnalyzesTo(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
+    assertAnalyzesTo(a, "برگها", new String[] { "برگ" });
   }
 
   /**

@@ -72,7 +72,7 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PersianNormalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -34,8 +34,8 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
-    checkOneTermReuse(a, "edeltäjistään", "edeltäj");
+    checkOneTerm(a, "edeltäjiinsä", "edeltäj");
+    checkOneTerm(a, "edeltäjistään", "edeltäj");
     // stopword
     assertAnalyzesTo(a, "olla", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
     Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
         FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
-    checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
+    checkOneTerm(a, "edeltäjiinsä", "edeltäj");
+    checkOneTerm(a, "edeltäjistään", "edeltäjistään");
   }
 
   /** blast some random strings through the analyzer */

@@ -75,6 +75,6 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new FinnishLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -117,13 +117,13 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
   public void testReusableTokenStream() throws Exception {
     FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
     // stopwords
-    assertAnalyzesToReuse(
+    assertAnalyzesTo(
         fa,
         "le la chien les aux chat du des à cheval",
         new String[] { "chien", "chat", "cheval" });
 
     // some nouns and adjectives
-    assertAnalyzesToReuse(
+    assertAnalyzesTo(
         fa,
         "lances chismes habitable chiste éléments captifs",
         new String[] {

@@ -140,7 +140,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
     set.add("habitable");
     FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
         CharArraySet.EMPTY_SET, set);
-    assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
         "chist" });
 
     fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);

@@ -169,7 +169,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
   /** test accent-insensitive */
   public void testAccentInsensitive() throws Exception {
     Analyzer a = new FrenchAnalyzer(TEST_VERSION_CURRENT);
-    checkOneTermReuse(a, "sécuritaires", "securitair");
-    checkOneTermReuse(a, "securitaires", "securitair");
+    checkOneTerm(a, "sécuritaires", "securitair");
+    checkOneTerm(a, "securitaires", "securitair");
   }
 }

@@ -205,6 +205,6 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new FrenchLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -89,6 +89,6 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new FrenchMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "siopadóireacht", "siopadóir");
-    checkOneTermReuse(a, "síceapatacha", "síceapaite");
+    checkOneTerm(a, "siopadóireacht", "siopadóir");
+    checkOneTerm(a, "síceapatacha", "síceapaite");
     // stopword
     assertAnalyzesTo(a, "le", new String[] { });
   }

@@ -52,8 +52,8 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("feirmeoireacht"), false);
     Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT,
         IrishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "feirmeoireacht", "feirmeoireacht");
-    checkOneTermReuse(a, "siopadóireacht", "siopadóir");
+    checkOneTerm(a, "feirmeoireacht", "feirmeoireacht");
+    checkOneTerm(a, "siopadóireacht", "siopadóir");
   }
 
   /** test special hyphen handling */

@@ -52,6 +52,6 @@ public class TestIrishLowerCaseFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new IrishLowerCaseFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "correspondente", "correspond");
-    checkOneTermReuse(a, "corresponderá", "correspond");
+    checkOneTerm(a, "correspondente", "correspond");
+    checkOneTerm(a, "corresponderá", "correspond");
     // stopword
     assertAnalyzesTo(a, "e", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false);
     Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
         GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "correspondente", "correspondente");
-    checkOneTermReuse(a, "corresponderá", "correspond");
+    checkOneTerm(a, "correspondente", "correspondente");
+    checkOneTerm(a, "corresponderá", "correspond");
   }
 
   /** blast some random strings through the analyzer */

@@ -79,6 +79,6 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -58,6 +58,6 @@ public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new GalicianStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,15 +34,15 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws Exception {
     Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT);
     // two ways to write 'hindi' itself.
-    checkOneTermReuse(a, "हिन्दी", "हिंद");
-    checkOneTermReuse(a, "हिंदी", "हिंद");
+    checkOneTerm(a, "हिन्दी", "हिंद");
+    checkOneTerm(a, "हिंदी", "हिंद");
   }
 
   public void testExclusionSet() throws Exception {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false);
     Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
         HindiAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "हिंदी", "हिंदी");
+    checkOneTerm(a, "हिंदी", "हिंदी");
   }
 
   /** blast some random strings through the analyzer */

@@ -75,6 +75,6 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new HindiNormalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -97,6 +97,6 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new HindiStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "babakocsi", "babakocs");
-    checkOneTermReuse(a, "babakocsijáért", "babakocs");
+    checkOneTerm(a, "babakocsi", "babakocs");
+    checkOneTerm(a, "babakocsijáért", "babakocs");
     // stopword
     assertAnalyzesTo(a, "által", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
     Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
         HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "babakocsi", "babakocsi");
-    checkOneTermReuse(a, "babakocsijáért", "babakocs");
+    checkOneTerm(a, "babakocsi", "babakocsi");
+    checkOneTerm(a, "babakocsijáért", "babakocs");
   }
 
   /** blast some random strings through the analyzer */

@@ -70,6 +70,6 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new HungarianLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -89,6 +89,6 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "արծիվ", "արծ");
-    checkOneTermReuse(a, "արծիվներ", "արծ");
+    checkOneTerm(a, "արծիվ", "արծ");
+    checkOneTerm(a, "արծիվներ", "արծ");
     // stopword
     assertAnalyzesTo(a, "է", new String[] { });
   }

@@ -45,8 +45,8 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false);
     Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
         ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "արծիվներ", "արծիվներ");
-    checkOneTermReuse(a, "արծիվ", "արծ");
+    checkOneTerm(a, "արծիվներ", "արծիվներ");
+    checkOneTerm(a, "արծիվ", "արծ");
   }
 
   /** blast some random strings through the analyzer */

@@ -34,8 +34,8 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "peledakan", "ledak");
-    checkOneTermReuse(a, "pembunuhan", "bunuh");
+    checkOneTerm(a, "peledakan", "ledak");
+    checkOneTerm(a, "pembunuhan", "bunuh");
     // stopword
     assertAnalyzesTo(a, "bahwa", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false);
     Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
         IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "peledakan", "peledakan");
-    checkOneTermReuse(a, "pembunuhan", "bunuh");
+    checkOneTerm(a, "peledakan", "peledakan");
+    checkOneTerm(a, "pembunuhan", "bunuh");
   }
 
   /** blast some random strings through the analyzer */

@@ -41,73 +41,73 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
   /** Some examples from the paper */
   public void testExamples() throws IOException {
     checkOneTerm(a, "bukukah", "buku");
-    checkOneTermReuse(a, "adalah", "ada");
-    checkOneTermReuse(a, "bukupun", "buku");
-    checkOneTermReuse(a, "bukuku", "buku");
-    checkOneTermReuse(a, "bukumu", "buku");
-    checkOneTermReuse(a, "bukunya", "buku");
-    checkOneTermReuse(a, "mengukur", "ukur");
-    checkOneTermReuse(a, "menyapu", "sapu");
-    checkOneTermReuse(a, "menduga", "duga");
-    checkOneTermReuse(a, "menuduh", "uduh");
-    checkOneTermReuse(a, "membaca", "baca");
-    checkOneTermReuse(a, "merusak", "rusak");
-    checkOneTermReuse(a, "pengukur", "ukur");
-    checkOneTermReuse(a, "penyapu", "sapu");
-    checkOneTermReuse(a, "penduga", "duga");
-    checkOneTermReuse(a, "pembaca", "baca");
-    checkOneTermReuse(a, "diukur", "ukur");
-    checkOneTermReuse(a, "tersapu", "sapu");
-    checkOneTermReuse(a, "kekasih", "kasih");
-    checkOneTermReuse(a, "berlari", "lari");
-    checkOneTermReuse(a, "belajar", "ajar");
-    checkOneTermReuse(a, "bekerja", "kerja");
-    checkOneTermReuse(a, "perjelas", "jelas");
-    checkOneTermReuse(a, "pelajar", "ajar");
-    checkOneTermReuse(a, "pekerja", "kerja");
-    checkOneTermReuse(a, "tarikkan", "tarik");
-    checkOneTermReuse(a, "ambilkan", "ambil");
-    checkOneTermReuse(a, "mengambilkan", "ambil");
-    checkOneTermReuse(a, "makanan", "makan");
-    checkOneTermReuse(a, "janjian", "janji");
-    checkOneTermReuse(a, "perjanjian", "janji");
-    checkOneTermReuse(a, "tandai", "tanda");
-    checkOneTermReuse(a, "dapati", "dapat");
-    checkOneTermReuse(a, "mendapati", "dapat");
-    checkOneTermReuse(a, "pantai", "panta");
+    checkOneTerm(a, "adalah", "ada");
+    checkOneTerm(a, "bukupun", "buku");
+    checkOneTerm(a, "bukuku", "buku");
+    checkOneTerm(a, "bukumu", "buku");
+    checkOneTerm(a, "bukunya", "buku");
+    checkOneTerm(a, "mengukur", "ukur");
+    checkOneTerm(a, "menyapu", "sapu");
+    checkOneTerm(a, "menduga", "duga");
+    checkOneTerm(a, "menuduh", "uduh");
+    checkOneTerm(a, "membaca", "baca");
+    checkOneTerm(a, "merusak", "rusak");
+    checkOneTerm(a, "pengukur", "ukur");
+    checkOneTerm(a, "penyapu", "sapu");
+    checkOneTerm(a, "penduga", "duga");
+    checkOneTerm(a, "pembaca", "baca");
+    checkOneTerm(a, "diukur", "ukur");
+    checkOneTerm(a, "tersapu", "sapu");
+    checkOneTerm(a, "kekasih", "kasih");
+    checkOneTerm(a, "berlari", "lari");
+    checkOneTerm(a, "belajar", "ajar");
+    checkOneTerm(a, "bekerja", "kerja");
+    checkOneTerm(a, "perjelas", "jelas");
+    checkOneTerm(a, "pelajar", "ajar");
+    checkOneTerm(a, "pekerja", "kerja");
+    checkOneTerm(a, "tarikkan", "tarik");
+    checkOneTerm(a, "ambilkan", "ambil");
+    checkOneTerm(a, "mengambilkan", "ambil");
+    checkOneTerm(a, "makanan", "makan");
+    checkOneTerm(a, "janjian", "janji");
+    checkOneTerm(a, "perjanjian", "janji");
+    checkOneTerm(a, "tandai", "tanda");
+    checkOneTerm(a, "dapati", "dapat");
+    checkOneTerm(a, "mendapati", "dapat");
+    checkOneTerm(a, "pantai", "panta");
   }
 
   /** Some detailed analysis examples (that might not be the best) */
   public void testIRExamples() throws IOException {
     checkOneTerm(a, "penyalahgunaan", "salahguna");
-    checkOneTermReuse(a, "menyalahgunakan", "salahguna");
-    checkOneTermReuse(a, "disalahgunakan", "salahguna");
+    checkOneTerm(a, "menyalahgunakan", "salahguna");
+    checkOneTerm(a, "disalahgunakan", "salahguna");
 
-    checkOneTermReuse(a, "pertanggungjawaban", "tanggungjawab");
-    checkOneTermReuse(a, "mempertanggungjawabkan", "tanggungjawab");
-    checkOneTermReuse(a, "dipertanggungjawabkan", "tanggungjawab");
+    checkOneTerm(a, "pertanggungjawaban", "tanggungjawab");
+    checkOneTerm(a, "mempertanggungjawabkan", "tanggungjawab");
+    checkOneTerm(a, "dipertanggungjawabkan", "tanggungjawab");
 
-    checkOneTermReuse(a, "pelaksanaan", "laksana");
-    checkOneTermReuse(a, "pelaksana", "laksana");
-    checkOneTermReuse(a, "melaksanakan", "laksana");
-    checkOneTermReuse(a, "dilaksanakan", "laksana");
+    checkOneTerm(a, "pelaksanaan", "laksana");
+    checkOneTerm(a, "pelaksana", "laksana");
+    checkOneTerm(a, "melaksanakan", "laksana");
+    checkOneTerm(a, "dilaksanakan", "laksana");
 
-    checkOneTermReuse(a, "melibatkan", "libat");
-    checkOneTermReuse(a, "terlibat", "libat");
+    checkOneTerm(a, "melibatkan", "libat");
+    checkOneTerm(a, "terlibat", "libat");
 
-    checkOneTermReuse(a, "penculikan", "culik");
-    checkOneTermReuse(a, "menculik", "culik");
-    checkOneTermReuse(a, "diculik", "culik");
-    checkOneTermReuse(a, "penculik", "culik");
+    checkOneTerm(a, "penculikan", "culik");
+    checkOneTerm(a, "menculik", "culik");
+    checkOneTerm(a, "diculik", "culik");
+    checkOneTerm(a, "penculik", "culik");
 
-    checkOneTermReuse(a, "perubahan", "ubah");
-    checkOneTermReuse(a, "peledakan", "ledak");
-    checkOneTermReuse(a, "penanganan", "tangan");
-    checkOneTermReuse(a, "kepolisian", "polisi");
-    checkOneTermReuse(a, "kenaikan", "naik");
-    checkOneTermReuse(a, "bersenjata", "senjata");
-    checkOneTermReuse(a, "penyelewengan", "seleweng");
-    checkOneTermReuse(a, "kecelakaan", "celaka");
+    checkOneTerm(a, "perubahan", "ubah");
+    checkOneTerm(a, "peledakan", "ledak");
+    checkOneTerm(a, "penanganan", "tangan");
+    checkOneTerm(a, "kepolisian", "polisi");
+    checkOneTerm(a, "kenaikan", "naik");
+    checkOneTerm(a, "bersenjata", "senjata");
+    checkOneTerm(a, "penyelewengan", "seleweng");
+    checkOneTerm(a, "kecelakaan", "celaka");
   }
 
   /* inflectional-only stemming */

@@ -122,15 +122,15 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
   /** Test stemming only inflectional suffixes */
   public void testInflectionalOnly() throws IOException {
     checkOneTerm(b, "bukunya", "buku");
-    checkOneTermReuse(b, "bukukah", "buku");
-    checkOneTermReuse(b, "bukunyakah", "buku");
-    checkOneTermReuse(b, "dibukukannya", "dibukukan");
+    checkOneTerm(b, "bukukah", "buku");
+    checkOneTerm(b, "bukunyakah", "buku");
+    checkOneTerm(b, "dibukukannya", "dibukukan");
   }
 
   public void testShouldntStem() throws IOException {
     checkOneTerm(a, "bersenjata", "senjata");
-    checkOneTermReuse(a, "bukukah", "buku");
-    checkOneTermReuse(a, "gigi", "gigi");
+    checkOneTerm(a, "bukukah", "buku");
+    checkOneTerm(a, "gigi", "gigi");
   }
 
   public void testEmptyTerm() throws IOException {

@@ -141,6 +141,6 @@ public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new IndonesianStemFilter(tokenizer));
      }
    };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -60,6 +60,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new IndicNormalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -37,8 +37,8 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "abbandonata", "abbandonat");
-    checkOneTermReuse(a, "abbandonati", "abbandonat");
+    checkOneTerm(a, "abbandonata", "abbandonat");
+    checkOneTerm(a, "abbandonati", "abbandonat");
     // stopword
     assertAnalyzesTo(a, "dallo", new String[] {});
   }

@@ -48,8 +48,8 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false);
     Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
         ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "abbandonata", "abbandonata");
-    checkOneTermReuse(a, "abbandonati", "abbandonat");
+    checkOneTerm(a, "abbandonata", "abbandonata");
+    checkOneTerm(a, "abbandonati", "abbandonat");
   }
 
   /** blast some random strings through the analyzer */

@@ -59,6 +59,6 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ItalianLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "tirgiem", "tirg");
-    checkOneTermReuse(a, "tirgus", "tirg");
+    checkOneTerm(a, "tirgiem", "tirg");
+    checkOneTerm(a, "tirgus", "tirg");
     // stopword
     assertAnalyzesTo(a, "un", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false);
     Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
         LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "tirgiem", "tirgiem");
-    checkOneTermReuse(a, "tirgus", "tirg");
+    checkOneTerm(a, "tirgiem", "tirgiem");
+    checkOneTerm(a, "tirgus", "tirg");
   }
 
   /** blast some random strings through the analyzer */

@@ -278,6 +278,6 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -1934,6 +1934,6 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -143,6 +143,6 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -85,6 +85,6 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -48,7 +48,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -175,7 +175,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -115,6 +115,6 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -197,7 +197,6 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
     TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
-    tk.reset();
     assertTokenStreamContents(tk,
         new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
         new int[] { 6,11,11,14 },

@@ -115,24 +115,24 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
 
   public void testSnowballCorrectness() throws Exception {
     Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
-    checkOneTermReuse(a, "opheffen", "opheff");
-    checkOneTermReuse(a, "opheffende", "opheff");
-    checkOneTermReuse(a, "opheffing", "opheff");
+    checkOneTerm(a, "opheffen", "opheff");
+    checkOneTerm(a, "opheffende", "opheff");
+    checkOneTerm(a, "opheffing", "opheff");
   }
 
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
-    checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
-    checkOneTermReuse(a, "lichamelijk", "licham");
-    checkOneTermReuse(a, "lichamelijke", "licham");
-    checkOneTermReuse(a, "lichamelijkheden", "licham");
+    checkOneTerm(a, "lichaamsziek", "lichaamsziek");
+    checkOneTerm(a, "lichamelijk", "licham");
+    checkOneTerm(a, "lichamelijke", "licham");
+    checkOneTerm(a, "lichamelijkheden", "licham");
   }
 
   public void testExclusionTableViaCtor() throws IOException {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("lichamelijk");
     DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
-    assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+    assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
 
     a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });

@@ -34,8 +34,8 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
-    checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
+    checkOneTerm(a, "havnedistriktene", "havnedistrikt");
+    checkOneTerm(a, "havnedistrikter", "havnedistrikt");
     // stopword
     assertAnalyzesTo(a, "det", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false);
     Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
         NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
-    checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
+    checkOneTerm(a, "havnedistriktene", "havnedistriktene");
+    checkOneTerm(a, "havnedistrikter", "havnedistrikt");
   }
 
   /** blast some random strings through the analyzer */

@@ -93,6 +93,6 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new NorwegianLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -92,6 +92,6 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -114,7 +114,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -34,6 +34,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
       DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
+    filter.reset();
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
     assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));

@@ -45,6 +46,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
     assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes("UTF-8"));
     assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
   }
 
   public void testNext() throws Exception {

@@ -53,6 +56,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
       (new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
       DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
+    filter.reset();
     assertTermEquals("The", filter, null);
     assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
     assertTermEquals("red", filter, "JJ".getBytes("UTF-8"));

@@ -64,6 +68,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     assertTermEquals("brown", filter, "JJ".getBytes("UTF-8"));
     assertTermEquals("dogs", filter, "NN".getBytes("UTF-8"));
     assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
   }
 
 

@@ -72,6 +78,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new FloatEncoder());
     CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
+    filter.reset();
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeFloat(1.0f));
     assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeFloat(2.0f));

@@ -83,6 +90,8 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeFloat(99.3f));
     assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeFloat(83.7f));
     assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
   }
 
   public void testIntEncoding() throws Exception {

@@ -90,6 +99,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new IntegerEncoder());
     CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
+    filter.reset();
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeInt(1));
     assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeInt(2));

@@ -101,12 +111,13 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeInt(99));
     assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeInt(83));
     assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
   }
 
   void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
     CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
     PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
-    stream.reset();
     assertTrue(stream.incrementToken());
     assertEquals(expected, termAtt.toString());
     BytesRef payload = payloadAtt.getPayload();

@@ -123,7 +134,6 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
 
 
   void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
-    stream.reset();
     assertTrue(stream.incrementToken());
     assertEquals(expected, termAtt.toString());
     BytesRef payload = payAtt.getPayload();

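The filter.reset(), filter.end() and filter.close() calls added above spell out the TokenStream consumer contract this change begins to enforce: reset() before the first incrementToken(), end() after the last one, then close(). A minimal consumer, as a sketch assuming a Lucene 4.x-era stream (the class and method names here are illustrative, not part of the commit):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ConsumeTokenStream {
      public static void consume(TokenStream stream) throws IOException {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                    // mandatory; skipping it is what now raises IllegalStateException
        while (stream.incrementToken()) {  // one token per call
          System.out.println(termAtt.toString());
        }
        stream.end();                      // records end-of-stream offset state
        stream.close();                    // releases the underlying Reader
      }
    }
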
@@ -34,8 +34,8 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "quilométricas", "quilometric");
-    checkOneTermReuse(a, "quilométricos", "quilometric");
+    checkOneTerm(a, "quilométricas", "quilometric");
+    checkOneTerm(a, "quilométricos", "quilometric");
     // stopword
     assertAnalyzesTo(a, "não", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
     Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
         PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "quilométricas", "quilométricas");
-    checkOneTermReuse(a, "quilométricos", "quilometric");
+    checkOneTerm(a, "quilométricas", "quilométricas");
+    checkOneTerm(a, "quilométricos", "quilometric");
   }
 
   /** blast some random strings through the analyzer */

@@ -123,6 +123,6 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PortugueseLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -97,6 +97,6 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PortugueseMinimalStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -96,6 +96,6 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -113,6 +113,6 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
      }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "absenţa", "absenţ");
-    checkOneTermReuse(a, "absenţi", "absenţ");
+    checkOneTerm(a, "absenţa", "absenţ");
+    checkOneTerm(a, "absenţi", "absenţ");
     // stopword
     assertAnalyzesTo(a, "îl", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false);
     Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
         RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "absenţa", "absenţa");
-    checkOneTermReuse(a, "absenţi", "absenţ");
+    checkOneTerm(a, "absenţa", "absenţa");
+    checkOneTerm(a, "absenţi", "absenţ");
   }
 
   /** blast some random strings through the analyzer */

@@ -39,9 +39,9 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+    assertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
         new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
-    assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
+    assertAnalyzesTo(a, "Но знание это хранилось в тайне",
         new String[] { "знан", "эт", "хран", "тайн" });
   }

@@ -50,7 +50,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("представление");
     Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
-    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+    assertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
         new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
 
   }

@@ -75,6 +75,6 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new RussianLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -140,12 +140,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
-    assertAnalyzesToReuse(a, "please divide into shingles",
+    assertAnalyzesTo(a, "please divide into shingles",
         new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
-    assertAnalyzesToReuse(a, "divide me up again",
+    assertAnalyzesTo(a, "divide me up again",
         new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
         new int[] { 0, 0, 7, 7, 10, 10, 13 },
         new int[] { 6, 9, 9, 12, 12, 18, 18 },

@@ -155,7 +155,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   public void testNonDefaultMinShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
       = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4);
-    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+    assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please", "please divide this", "please divide this sentence",
                        "divide", "divide this sentence", "divide this sentence into",
                        "this", "this sentence into", "this sentence into shingles",

@@ -168,7 +168,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
-    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+    assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please divide this", "please divide this sentence",
                        "divide this sentence", "divide this sentence into",
                        "this sentence into", "this sentence into shingles",

@@ -181,7 +181,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
       = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3);
-    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+    assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please", "please divide this",
                        "divide", "divide this sentence",
                        "this", "this sentence into",

@@ -194,7 +194,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
-    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+    assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
         new String[] { "please divide this",
                        "divide this sentence",
                        "this sentence into",

@@ -210,7 +210,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", true, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide",
                        "divide", "divideinto",
                        "into", "intoshingles",

@@ -224,7 +224,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "pleasedivide",
                        "divideinto",
                        "intoshingles" },

@@ -239,7 +239,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         null, true, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide",
                        "divide", "divideinto",
                        "into", "intoshingles",

@@ -253,7 +253,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "pleasedivide",
                        "divideinto",
                        "intoshingles" },

@@ -267,7 +267,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "<SEP>", true, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please", "please<SEP>divide",
                        "divide", "divide<SEP>into",
                        "into", "into<SEP>shingles",

@@ -281,7 +281,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "<SEP>", false, false);
-    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+    assertAnalyzesTo(analyzer, "please divide into shingles",
         new String[] { "please<SEP>divide",
                        "divide<SEP>into",
                        "into<SEP>shingles" },

@@ -296,7 +296,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, true);
-    assertAnalyzesToReuse(analyzer, "please",
+    assertAnalyzesTo(analyzer, "please",
         new String[] { "please" },
         new int[] { 0 },
         new int[] { 6 },

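The wrapper configurations above vary only the last three constructor arguments. Reading the six-argument form as (delegate, minShingleSize, maxShingleSize, tokenSeparator, outputUnigrams, outputUnigramsIfNoShingles) — parameter roles inferred from the expected tokens, so treat the names as assumptions — the "<SEP>" cases join adjacent tokens with that separator:

    // Parameter roles inferred from the test expectations above.
    Analyzer shingles = new ShingleAnalyzerWrapper(
        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "<SEP>",   // separator placed inside each shingle
        false,     // no unigrams in the output
        false);    // not even when no shingles can be formed
    // "please divide into shingles" then analyzes to:
    //   please<SEP>divide, divide<SEP>into, into<SEP>shingles
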
@@ -1134,7 +1134,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
   public void testTrailingHole1() throws IOException {

@@ -114,7 +114,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }
 

@@ -34,8 +34,8 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
-    checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
+    checkOneTerm(a, "jaktkarlarne", "jaktkarl");
+    checkOneTerm(a, "jaktkarlens", "jaktkarl");
     // stopword
     assertAnalyzesTo(a, "och", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false);
     Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
         SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
-    checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
+    checkOneTerm(a, "jaktkarlarne", "jaktkarlarne");
+    checkOneTerm(a, "jaktkarlens", "jaktkarl");
   }
 
   /** blast some random strings through the analyzer */

@@ -75,6 +75,6 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new SwedishLightStemFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -92,14 +92,14 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
-    assertAnalyzesToReuse(analyzer, "", new String[] {});
+    assertAnalyzesTo(analyzer, "", new String[] {});
 
-    assertAnalyzesToReuse(
+    assertAnalyzesTo(
         analyzer,
         "การที่ได้ต้องแสดงว่างานดี",
         new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
 
-    assertAnalyzesToReuse(
+    assertAnalyzesTo(
         analyzer,
         "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
         new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });

@@ -136,6 +136,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ThaiWordFilter(TEST_VERSION_CURRENT, tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -34,8 +34,8 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
   public void testBasics() throws IOException {
     Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
-    checkOneTermReuse(a, "ağacı", "ağaç");
-    checkOneTermReuse(a, "ağaç", "ağaç");
+    checkOneTerm(a, "ağacı", "ağaç");
+    checkOneTerm(a, "ağaç", "ağaç");
     // stopword
     assertAnalyzesTo(a, "dolayı", new String[] {});
   }

@@ -45,8 +45,8 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false);
     Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
         TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
-    checkOneTermReuse(a, "ağacı", "ağacı");
-    checkOneTermReuse(a, "ağaç", "ağaç");
+    checkOneTerm(a, "ağacı", "ağacı");
+    checkOneTerm(a, "ağaç", "ağaç");
   }
 
   /** blast some random strings through the analyzer */

@@ -83,6 +83,6 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new TurkishLowerCaseFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -69,7 +69,7 @@ public class TestElision extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 
 }

@@ -45,8 +45,7 @@ public final class ICUTokenizer extends Tokenizer {
   /** true length of text in the buffer */
   private int length = 0;
   /** length in buffer that can be evaluated safely, up to a safe end point */
-  // note: usableLength is -1 here to best-effort AIOOBE consumers that don't call reset()
-  private int usableLength = -1;
+  private int usableLength = 0;
   /** accumulated offset of previous buffers for this reader, for offsetAtt */
   private int offset = 0;
 

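With reset() now enforced by the base class, the -1 "poison" value — kept only so that a consumer skipping reset() would trip an ArrayIndexOutOfBoundsException somewhere downstream — can revert to a plain 0. For illustration only (this is not how Lucene implements the check; the real enforcement lives in the Tokenizer base class), a subclass-level fail-fast guard of the kind such sentinels approximate might look like:

    // Hypothetical guard sketch, not the committed implementation.
    private boolean streamReady = false;

    @Override
    public void reset() throws IOException {
      super.reset();
      streamReady = true;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (!streamReady) {
        throw new IllegalStateException("reset() must be called before consuming the TokenStream");
      }
      // ... normal tokenization work ...
      return false;
    }
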
@@ -87,6 +87,6 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -87,6 +87,6 @@ public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -109,6 +109,6 @@ public class TestICUTransformFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

@@ -207,7 +207,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
-    assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+    assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
         new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང",
                        "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
   }

@@ -191,14 +191,14 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws IOException {
-    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
+    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
         new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
         new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
         new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
         new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
         new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
 
-    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
+    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
         new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
         new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
         new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },

@@ -243,7 +243,7 @@ public final class JapaneseTokenizer extends Tokenizer {
       outputCompounds = false;
       break;
     }
-    buffer.reset(null); // best effort NPE consumers that don't call reset()
+    buffer.reset(this.input);
 
     resetState();
 

@@ -260,8 +260,15 @@ public final class JapaneseTokenizer extends Tokenizer {
     this.dotOut = dotOut;
   }
 
+  @Override
+  public void close() throws IOException {
+    super.close();
+    buffer.reset(input);
+  }
+
   @Override
   public void reset() throws IOException {
+    super.reset();
     buffer.reset(input);
     resetState();
   }

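The new close() override and the super.reset() call codify the lifecycle every Tokenizer subclass follows after this change: let the base class manage the Reader first, then reinitialize per-stream state. A minimal sketch of the pattern, where MyTokenizer and its state are illustrative stand-ins rather than anything in the commit:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Emits the first few chars of the input as a single token.
    public final class MyTokenizer extends Tokenizer {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private boolean done;

      public MyTokenizer(Reader reader) {
        super(reader);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (done) {
          return false;
        }
        clearAttributes();
        char[] buf = new char[16];
        int read = input.read(buf);   // input is only usable once reset() has run
        done = true;
        if (read <= 0) {
          return false;
        }
        termAtt.copyBuffer(buf, 0, read);
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();                // required first: wires up this.input
        done = false;                 // then reset the per-stream state
      }
    }
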
@@ -75,6 +75,6 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
        return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
       }
     };
-    checkOneTermReuse(a, "", "");
+    checkOneTerm(a, "", "");
   }
 }

(Some files were not shown because too many files have changed in this diff.)