LUCENE-3113: fix analyzer bugs found by MockTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1104519 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-05-17 20:16:40 +00:00
parent 85f38eb661
commit f1a1844fb9
59 changed files with 257 additions and 232 deletions

View File

@ -75,6 +75,12 @@ Bug Fixes
caused a problem if you consumed a tokenstream, then reused it, added different
attributes to it, and consumed it again. (Robert Muir, Uwe Schindler)
* LUCENE-3113: Fixed some minor analysis bugs: double-reset() in ReusableAnalyzerBase
and ShingleAnalyzerWrapper, missing end() implementations in PrefixAwareTokenFilter
and PrefixAndSuffixAwareTokenFilter, invocations of incrementToken() after it
already returned false in CommonGramsQueryFilter, HyphenatedWordsFilter,
ShingleFilter, and SynonymFilter. (Robert Muir, Steven Rowe, Uwe Schindler)
New Features
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
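For context: MockTokenizer, used throughout the test changes below, tracks the stream lifecycle in a small state machine, which is what surfaced each of the bug classes above. A rough sketch of the idea (illustrative only, not the real implementation):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;

// Illustrative: state checking in the style of MockTokenizer, turning
// TokenStream contract violations into test failures.
abstract class StateCheckingStream extends TokenStream {
  enum State { RESET, INCREMENT, INCREMENT_FALSE, END, CLOSE }
  State streamState = State.CLOSE;

  @Override
  public boolean incrementToken() throws IOException {
    assert streamState == State.RESET || streamState == State.INCREMENT
        : "incrementToken() called while in wrong state: " + streamState;
    if (produceToken()) { streamState = State.INCREMENT; return true; }
    streamState = State.INCREMENT_FALSE; // a second call after false now fails
    return false;
  }

  // hypothetical hook standing in for the real tokenization
  abstract boolean produceToken() throws IOException;
}

Double reset(), incrementToken() after exhaustion, and a skipped end() all land in a wrong-state assert like the one above instead of passing silently.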

View File

@ -100,7 +100,7 @@ public class MockTokenizer extends Tokenizer {
endOffset = off;
cp = readCodePoint();
} while (cp >= 0 && isTokenChar(cp));
offsetAtt.setOffset(startOffset, endOffset);
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
streamState = State.INCREMENT;
return true;
}
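This offset fix matters whenever a CharFilter sits under the tokenizer: the tokenizer sees positions in the filtered text, and correctOffset() asks the CharFilter chain to map them back to the original input. A small sketch, reusing the normMap fixture from TestMappingCharFilter below (which rewrites "j" to "jj"):

// The emitted token is "jj", but its offsets must point at the single
// original character: start 0, end 1. Without correctOffset() the tokenizer
// would report end 2 and highlighting would overshoot.
CharStream cs = new MappingCharFilter(normMap, new StringReader("j"));
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});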

View File

@ -49,6 +49,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
private State previous;
private String previousType;
private boolean exhausted;
/**
* Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter
@ -67,6 +68,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
super.reset();
previous = null;
previousType = null;
exhausted = false;
}
/**
@ -79,7 +81,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
*/
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
while (!exhausted && input.incrementToken()) {
State current = captureState();
if (previous != null && !isGramType()) {
@ -96,6 +98,8 @@ public final class CommonGramsQueryFilter extends TokenFilter {
previous = current;
}
exhausted = true;
if (previous == null || GRAM_TYPE.equals(previousType)) {
return false;
}
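The same guard recurs in HyphenatedWordsFilter, ShingleFilter, and SynonymFilter below. Condensed into one hedged sketch (the two hooks are hypothetical stand-ins for each filter's real buffering logic):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// A filter that may emit buffered output after its input dries up must
// remember the exhaustion and never pull on input again.
abstract class ExhaustionAwareFilter extends TokenFilter {
  private boolean exhausted;

  protected ExhaustionAwareFilter(TokenStream input) { super(input); }

  @Override
  public boolean incrementToken() throws IOException {
    while (!exhausted && input.incrementToken()) {
      if (bufferOrEmit()) return true; // hypothetical: filter-specific work
    }
    exhausted = true;                  // input.incrementToken() returned false once
    return flushPending();             // hypothetical: emit any buffered final token
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    exhausted = false;                 // the flag must be cleared on reuse
  }

  abstract boolean bufferOrEmit() throws IOException;
  abstract boolean flushPending() throws IOException;
}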

View File

@ -59,6 +59,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
private final StringBuilder hyphenated = new StringBuilder();
private State savedState;
private boolean exhausted = false;
/**
* Creates a new HyphenatedWordsFilter
@ -74,7 +75,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
*/
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
while (!exhausted && input.incrementToken()) {
char[] term = termAttribute.buffer();
int termLength = termAttribute.length();
@ -96,6 +97,8 @@ public final class HyphenatedWordsFilter extends TokenFilter {
}
}
exhausted = true;
if (savedState != null) {
// the final term ends with a hyphen
// add back the hyphen, for backwards compatibility.
@ -115,6 +118,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
super.reset();
hyphenated.setLength(0);
savedState = null;
exhausted = false;
}
// ================================================= Helper Methods ================================================

View File

@ -76,4 +76,9 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
public void close() throws IOException {
suffix.close();
}
@Override
public void end() throws IOException {
suffix.end();
}
}
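PrefixAndSuffixAwareTokenFilter and PrefixAwareTokenFilter (next file) previously swallowed end() entirely. end() is the call that publishes end-of-input state such as the final offset; given some TokenStream ts built from these wrappers, a hedged consumer-side illustration of what the delegation restores:

// Assumes: import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
ts.reset();
while (ts.incrementToken()) { /* consume tokens */ }
ts.end();   // forwards to the wrapped stream(s) after this fix
int finalOffset = offsetAtt.endOffset(); // end-of-input, not the last token's end
ts.close();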

View File

@ -158,6 +158,12 @@ public class PrefixAwareTokenFilter extends TokenStream {
return suffixToken;
}
@Override
public void end() throws IOException {
prefix.end();
suffix.end();
}
@Override
public void close() throws IOException {
prefix.close();

View File

@ -225,7 +225,6 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
TokenStream result = delegate.reusableTokenStream(fieldName, reader);
if (result == streams.wrapped) {
/* the wrapped analyzer reused the stream */
streams.withStopFilter.reset();
} else {
/*
* the wrapped analyzer did not. if there are any stopwords for the

View File

@ -199,10 +199,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
setPreviousTokenStream(streams);
} else {
TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader);
if (result == streams.wrapped) {
/* the wrapped analyzer reused the stream */
streams.shingle.reset();
} else {
if (result != streams.wrapped) {
/* the wrapped analyzer did not, create a new shingle around the new one */
streams.wrapped = result;
streams.shingle = new ShingleFilter(streams.wrapped);
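Same fix as QueryAutoStopWordAnalyzer above: when the delegate analyzer reuses the stream, the wrapper must not call reset() on it, because resetting is the consumer's job and a second reset() now trips MockTokenizer's state check. The corrected reuse path, paraphrased from the diff:

// Rebuild the shingle chain only when the delegate handed back a different
// stream; otherwise leave it alone and let the consumer reset() it.
TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader);
if (result != streams.wrapped) {
  streams.wrapped = result;
  streams.shingle = new ShingleFilter(streams.wrapped);
}
return streams.shingle; // field name per the streams holder in this class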

View File

@ -327,6 +327,8 @@ public final class ShingleFilter extends TokenFilter {
return tokenAvailable;
}
private boolean exhausted;
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
@ -359,7 +361,7 @@ public final class ShingleFilter extends TokenFilter {
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
} else if (input.incrementToken()) {
} else if (!exhausted && input.incrementToken()) {
if (null == target) {
newTarget = new InputWindowToken(cloneAttributes());
} else {
@ -387,6 +389,7 @@ public final class ShingleFilter extends TokenFilter {
}
} else {
newTarget = null;
exhausted = true;
}
return newTarget;
}
@ -435,7 +438,8 @@ public final class ShingleFilter extends TokenFilter {
inputWindow.clear();
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;
exhausted = false;
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;

View File

@ -190,14 +190,18 @@ public final class SynonymFilter extends TokenFilter {
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private boolean exhausted;
private AttributeSource nextTok() throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
if (input.incrementToken()) {
if (!exhausted && input.incrementToken()) {
return this;
} else
} else {
exhausted = true;
return null;
}
}
}
@ -250,5 +254,6 @@ public final class SynonymFilter extends TokenFilter {
public void reset() throws IOException {
input.reset();
replacement = null;
exhausted = false;
}
}

View File

@ -159,8 +159,6 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
*/
protected boolean reset(final Reader reader) throws IOException {
source.reset(reader);
if(sink != source)
sink.reset(); // only reset if the sink reference is different from source
return true;
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@ -215,8 +215,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
set.add("строеве");
WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("строевете строеве"));
MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);
BulgarianStemFilter filter = new BulgarianStemFilter(
new KeywordMarkerFilter(tokenStream, set));
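Most of the remaining files repeat the swap shown here. MockTokenizer's constructor takes the reader, a tokenization pattern (WHITESPACE or KEYWORD in this commit), and a lowercasing flag, so the drop-in replacement for a non-lowercasing whitespace tokenizer is:

// Before (no contract checking):
//   Tokenizer tok = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
// After (same tokens, plus lifecycle assertions):
Tokenizer tok = new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false);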

View File

@ -22,8 +22,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@ -64,55 +64,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[0]);
}
@ -136,7 +136,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
//
public void testTokenStream() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
new int[]{0,2,4,6,8,11,16,20},
@ -157,7 +157,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
public void testChained() throws Exception {
CharStream cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
new int[]{0,5,8},

View File

@ -21,6 +21,7 @@ import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@ -90,7 +91,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsQueryFilter(new CommonGramsFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
new MockTokenizer(in, MockTokenizer.WHITESPACE, false), commonWords));
}
};
@ -159,7 +160,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
new MockTokenizer(in, MockTokenizer.WHITESPACE, false), commonWords);
}
};
@ -245,7 +246,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void testCaseSensitive() throws Exception {
final String input = "How The s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
TokenFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
"s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
@ -257,7 +258,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void testLastWordisStopWord() throws Exception {
final String input = "dog the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "dog_the" });
@ -268,7 +269,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void testFirstWordisStopWord() throws Exception {
final String input = "the dog";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_dog" });
@ -279,7 +280,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void testOneWordQueryStopWord() throws Exception {
final String input = "the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the" });
@ -290,7 +291,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void testOneWordQuery() throws Exception {
final String input = "monster";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "monster" });
@ -301,7 +302,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
*/
public void TestFirstAndLastStopWord() throws Exception {
final String input = "the of";
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });

View File

@ -21,6 +21,7 @@ import java.io.StringReader;
import org.xml.sax.InputSource;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@ -35,8 +36,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"min veninde som er lidt af en læsehest")), hyphenator,
new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@ -55,8 +56,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
// the word basket will not be added due to the longest match option
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"basketballkurv")), hyphenator, dict,
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
hyphenator, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
assertTokenStreamContents(tf,
@ -77,7 +78,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
2, 4);
@ -89,7 +90,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4, 6);
@ -101,7 +102,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4, 10);
@ -120,9 +121,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
"Sko", "Vind", "Rute", "Torkare", "Blad" };
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new MockTokenizer(
new StringReader(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
MockTokenizer.WHITESPACE, false),
dict);
assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
@ -149,7 +151,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);

View File

@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -36,36 +37,23 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
public void testExactCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = asSet("is", "the", "Time");
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopWords, false);
final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.toString());
assertTrue(stream.incrementToken());
assertEquals("The", termAtt.toString());
assertFalse(stream.incrementToken());
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false);
assertTokenStreamContents(stream, new String[] { "Now", "The" });
}
public void testIgnoreCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = asSet( "is", "the", "Time" );
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopWords, true);
final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.toString());
assertFalse(stream.incrementToken());
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
assertTokenStreamContents(stream, new String[] { "Now" });
}
public void testStopFilt() throws IOException {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.toString());
assertTrue(stream.incrementToken());
assertEquals("The", termAtt.toString());
assertFalse(stream.incrementToken());
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
assertTokenStreamContents(stream, new String[] { "Now", "The" });
}
/**
@ -85,11 +73,11 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
// with increments
StringReader reader = new StringReader(sb.toString());
StopFilter stpf = new StopFilter(Version.LUCENE_40, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
doTestStopPositons(stpf,true);
// without increments
reader = new StringReader(sb.toString());
stpf = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
doTestStopPositons(stpf,false);
// with increments, concatenating two stop filters
ArrayList<String> a0 = new ArrayList<String>();
@ -108,7 +96,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
reader = new StringReader(sb.toString());
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet0); // first part of the set
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
stpf0.setEnablePositionIncrements(true);
StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
doTestStopPositons(stpf01,true);
@ -119,6 +107,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
stpf.setEnablePositionIncrements(enableIcrements);
CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
stpf.reset();
for (int i=0; i<20; i+=3) {
assertTrue(stpf.incrementToken());
log("Token "+i+": "+stpf);
@ -127,6 +116,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
}
assertFalse(stpf.incrementToken());
stpf.end();
stpf.close();
}
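assertTokenStreamContents does the reset()/end()/close() bracketing internally, which is why the tests above could simply switch helpers; a hand-rolled loop like doTestStopPositons has to do it itself, as the added lines show:

// Required bracketing for manual consumption once MockTokenizer is checking:
stpf.reset();                      // before the first incrementToken()
while (stpf.incrementToken()) {
  // per-token assertions
}
stpf.end();                        // after incrementToken() has returned false
stpf.close();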
// print debug info depending on VERBOSE

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
@ -278,7 +278,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("hole");
CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hole desek")), set));
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new GermanLightStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new GermanMinimalStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
@ -34,7 +34,7 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source));
}
};

View File

@ -22,12 +22,11 @@ import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -41,7 +40,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer t = new KeywordTokenizer(reader);
Tokenizer t = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(t, new PorterStemFilter(t));
}
};
@ -57,7 +56,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("yourselves");
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("yourselves yours"));
Tokenizer tokenizer = new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
}

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new SpanishLightStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new FinnishLightStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new FrenchLightStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source));
}
};

View File

@ -21,9 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test HindiNormalizer
@ -59,8 +59,7 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
}
private void check(String input, String output) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
TokenFilter tf = new HindiNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}

View File

@ -21,9 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test HindiStemmer
@ -81,8 +81,7 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
}
private void check(String input, String output) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
TokenFilter tf = new HindiStemFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
}
};

View File

@ -21,9 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test IndicNormalizer
@ -44,8 +44,7 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
}
private void check(String input, String output) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] { output });
}

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new ItalianLightStemFilter(source));
}
};

View File

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
@ -33,7 +33,7 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
private Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
}
};

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -30,14 +31,14 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
// testLatin1Accents() is a copy of TestLatin1AccentFilter.testU().
public void testLatin1Accents() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader
TokenStream stream = new MockTokenizer(new StringReader
("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ"
+" Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij"
+" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
+" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"), MockTokenizer.WHITESPACE, false);
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
filter.reset();
assertTermEquals("Des", filter, termAtt);
assertTermEquals("mot", filter, termAtt);
assertTermEquals("cles", filter, termAtt);
@ -1891,10 +1892,11 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
expectedOutputTokens.add(expected.toString());
}
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(inputText.toString()));
TokenStream stream = new MockTokenizer(new StringReader(inputText.toString()), MockTokenizer.WHITESPACE, false);
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
Iterator<String> expectedIter = expectedOutputTokens.iterator();
filter.reset();
while (expectedIter.hasNext()) {
assertTermEquals(expectedIter.next(), filter, termAtt);
}

View File

@ -25,9 +25,8 @@ import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
@ -105,7 +104,7 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
int maxTokenLength) throws IOException {
assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
assertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
maxWordCount, maxTokenLength);
}
@ -114,7 +113,7 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
int maxTokenLength) throws IOException {
assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)),
assertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false),
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
minWordLength, maxWordCount, maxTokenLength);
}

View File

@ -20,8 +20,8 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* HyphenatedWordsFilter test
@ -30,7 +30,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
// first test
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });

View File

@ -22,8 +22,8 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/** Test {@link KeepWordFilter} */
@ -38,22 +38,22 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
// Test Stopwords
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
// Test Stopwords
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}

View File

@ -8,9 +8,9 @@ import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@ -45,17 +45,17 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
"jumps" };
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"The quIck browN LuceneFox Jumps")), set)), output);
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
Set<String> jdkSet = new HashSet<String>();
jdkSet.add("LuceneFox");
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"The quIck browN LuceneFox Jumps")), jdkSet)), output);
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output);
Set<?> set2 = set;
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"The quIck browN LuceneFox Jumps")), set2)), output);
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
}
// LUCENE-2901
@ -63,8 +63,7 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
TokenStream ts = new LowerCaseFilterMock(
new KeywordMarkerFilter(
new KeywordMarkerFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("Dogs Trees Birds Houses")),
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
new HashSet<String>(Arrays.asList(new String[] { "Birds", "Houses" }))),
new HashSet<String>(Arrays.asList(new String[] { "Dogs", "Trees" }))));

View File

@ -18,15 +18,13 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.StringReader;
public class TestLengthFilter extends BaseTokenStreamTestCase {
public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
LengthFilter filter = new LengthFilter(false, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
@ -35,8 +33,8 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
}
public void testFilterWithPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
LengthFilter filter = new LengthFilter(true, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},

View File

@ -18,8 +18,8 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
@ -30,7 +30,7 @@ public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
new SingleTokenTokenStream(createToken("^", 0, 0)),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")),
new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false),
new SingleTokenTokenStream(createToken("$", 0, 0)));
assertTokenStreamContents(ts,

View File

@ -18,8 +18,8 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
@ -41,7 +41,7 @@ public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase {
// prefix and suffix using 2x prefix
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")));
new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
assertTokenStreamContents(ts,

View File

@ -19,12 +19,11 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -127,8 +126,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}
public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 0, 1, 0, 1, 1, null);
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 0, 1, 0, 1, 1, null);
assertTokenStreamContents(wdf, output);
}
@ -169,8 +168,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), 1,1,0,0,0,1,0,1,stemPossessive, null);
assertTokenStreamContents(wdf, output);
}
@ -216,7 +215,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@ -244,7 +243,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new LargePosIncTokenFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@ -276,7 +275,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader reader) {
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), StandardAnalyzer.STOP_WORDS_SET);
filter.setEnablePositionIncrements(true);
return new WordDelimiterFilter(filter,
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@ -32,7 +33,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}
public void testInvalidInput() throws Exception {
@ -91,7 +92,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@ -32,7 +33,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}
public void testInvalidInput() throws Exception {
@ -80,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}

@ -24,8 +24,8 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Tests {@link PatternReplaceCharFilter}
@ -39,7 +39,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "this is test.";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
@ -52,8 +52,8 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertFalse(ts.incrementToken());
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[] {});
}
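This hunk is more than a mechanical swap: a bare assertFalse(ts.incrementToken()) would now trip MockTokenizer's state checking, because incrementToken() may not be called before reset(). assertTokenStreamContents runs the whole reset/increment/end sequence even when no tokens are expected. A sketch of the corrected test shape, assuming the surrounding BaseTokenStreamTestCase subclass and the pattern(...) helper defined in this file:

public void testEmptyOutput() throws Exception {
  // the char filter deletes all input, so the tokenizer must produce nothing
  CharStream cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "",
      CharReader.get(new StringReader("aa bb cc")));
  TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
  assertTokenStreamContents(ts, new String[] {}); // consumes the stream correctly even with zero tokens
}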
// 012345678
@ -63,7 +63,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
@ -78,7 +78,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },
@ -92,7 +92,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " a a";
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },
@ -107,7 +107,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },
@ -122,7 +122,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
@ -137,7 +137,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
@ -154,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },

@ -18,8 +18,8 @@
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.StringReader;
import java.util.regex.Pattern;
@ -32,7 +32,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
"-", true);
assertTokenStreamContents(ts,
@ -42,7 +42,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
"-", false);
assertTokenStreamContents(ts,
@ -52,7 +52,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testStripFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
null, false);
assertTokenStreamContents(ts,
@ -62,7 +62,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testStripAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
null, true);
assertTokenStreamContents(ts,
@ -72,7 +72,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAllWithBackRef() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("(a*)b"),
"$1\\$", true);
assertTokenStreamContents(ts,

@ -16,8 +16,8 @@ package org.apache.lucene.analysis.payloads;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
@ -30,7 +30,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testPayloads() throws Exception {
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
@ -51,7 +51,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
assertTermEquals("The", filter, null);
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
@ -69,7 +69,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testFloatEncoding() throws Exception {
String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new FloatEncoder());
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new FloatEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);
@ -87,7 +87,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testIntEncoding() throws Exception {
String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new IntegerEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);
@ -106,6 +106,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
Payload payload = payloadAtt.getPayload();
@ -122,6 +123,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
Payload payload = payAtt.getPayload();
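The added stream.reset() calls reflect the TokenStream contract that MockTokenizer now enforces on tests pulling tokens by hand. A minimal sketch of the required consumer workflow (the method name is illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumerWorkflowSketch {
  static void consumeByHand(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    stream.reset();                    // must precede the first incrementToken()
    while (stream.incrementToken()) {
      System.out.println(term);        // inspect per-token attributes here
    }
    stream.end();                      // record end-of-stream state
    stream.close();
  }
}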

@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@ -32,11 +32,12 @@ public class NumericPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))), 3, "D");
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)), 3, "D");
boolean seenDogs = false;
CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
nptf.reset();
while (nptf.incrementToken()) {
if (termAtt.toString().equals("dogs")) {
seenDogs = true;

@ -17,7 +17,7 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
@ -30,11 +30,11 @@ public class TokenOffsetPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
int count = 0;
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
nptf.reset();
while (nptf.incrementToken()) {
Payload pay = payloadAtt.getPayload();
assertTrue("pay is null and it shouldn't be", pay != null);

@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@ -32,12 +32,12 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
int count = 0;
CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
nptf.reset();
while (nptf.incrementToken()) {
assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);

@ -22,6 +22,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
@ -50,7 +52,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
public void setUp() throws Exception {
super.setUp();
dir = new RAMDirectory();
appAnalyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
appAnalyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, appAnalyzer));
int numDocs = 200;
for (int i = 0; i < numDocs; i++) {
@ -159,9 +161,9 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
else
return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
}
}
@ -175,7 +177,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
}
public void testTokenStream() throws Exception {
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
a.addStopWords(reader, 10);
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
assertTokenStreamContents(ts, new String[] { "this" });
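Tests that build an index swap the whole analyzer rather than a single tokenizer: MockAnalyzer(random, MockTokenizer.WHITESPACE, false) packages a MockTokenizer behind the Analyzer API, with `random` supplied by LuceneTestCase. A short sketch of the setUp pattern used above (dir and TEST_VERSION_CURRENT as in this file):

Analyzer appAnalyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter writer = new IndexWriter(dir,
    new IndexWriterConfig(TEST_VERSION_CURRENT, appAnalyzer));
// ... add documents and close the writer, then exercise the analyzer under test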

@ -19,22 +19,22 @@ package org.apache.lucene.analysis.reverse;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("Do have a nice day")); // 1-4 length string
TokenStream stream = new MockTokenizer(new StringReader("Do have a nice day"),
MockTokenizer.WHITESPACE, false); // 1-4 length string
ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream);
assertTokenStreamContents(filter, new String[] { "oD", "evah", "a", "ecin", "yad" });
}
public void testFilterWithMark() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Do have a nice day")); // 1-4 length string
TokenStream stream = new MockTokenizer(new StringReader("Do have a nice day"),
MockTokenizer.WHITESPACE, false); // 1-4 length string
ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream, '\u0001');
assertTokenStreamContents(filter,
new String[] { "\u0001oD", "\u0001evah", "\u0001a", "\u0001ecin", "\u0001yad" });

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new RussianLightStemFilter(source));
}
};

@ -22,10 +22,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
@ -106,7 +105,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"test sentence");
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
@ -117,7 +116,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"\"this sentence\"");
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
@ -128,7 +127,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"\"test sentence\"");
int[] ranks = new int[] { 1 };
compareRanks(hits, ranks);
@ -139,7 +138,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"+test +sentence");
int[] ranks = new int[] { 1, 2 };
compareRanks(hits, ranks);
@ -149,7 +148,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
* This shows how to construct a phrase query containing shingles.
*/
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
searcher = setUpSearcher(analyzer);
PhraseQuery q = new PhraseQuery();
@ -161,6 +160,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
j += posIncrAtt.getPositionIncrement();
String termText = termAtt.toString();
@ -178,7 +178,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
* in the right order and adjacent to each other.
*/
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
searcher = setUpSearcher(analyzer);
BooleanQuery q = new BooleanQuery();
@ -188,6 +188,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
String termText = termAtt.toString();
q.add(new TermQuery(new Term("content", termText)),
@ -200,7 +202,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
assertAnalyzesToReuse(a, "please divide into shingles",
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
@ -222,9 +224,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
else
return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
}
}
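Both branches of the alternating helper above now return MockTokenizer, so whichever path the wrapper takes, the consumer is verified: WHITESPACE stands in for WhitespaceTokenizer, and SIMPLE approximates LetterTokenizer's letter-run tokens. The shape of the helper, condensed:

public TokenStream tokenStream(String fieldName, Reader reader) {
  return (++invocationCount % 2 == 0)
      ? new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)
      : new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
}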
@ -249,7 +251,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 4);
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this", "please divide this sentence",
"divide", "divide this sentence", "divide this sentence into",
@ -273,7 +275,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 3);
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this",
"divide", "divide this sentence",
@ -297,7 +299,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNoTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
@ -319,7 +321,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNullTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator(null);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
@ -340,7 +342,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("<SEP>");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
@ -362,7 +364,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setOutputUnigrams(false);
analyzer.setOutputUnigramsIfNoShingles(true);
assertAnalyzesToReuse(analyzer, "please",

@ -22,14 +22,14 @@ import java.text.SimpleDateFormat;
import java.util.Locale;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
int count = 0;

@ -84,7 +84,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
Document doc = new Document();
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
@ -108,7 +108,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
}
public void testGeneral() throws IOException {
final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TokenStream sink1 = source.newSinkTokenStream();
final TokenStream sink2 = source.newSinkTokenStream(theFilter);
@ -122,16 +122,17 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
}
public void testMultipleSources() throws Exception {
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);
tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer2.toString())));
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;
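The added tee1.reset() matters because CachingTokenFilter consumes its input lazily; without the explicit reset, the first pull through the cache would hit the MockTokenizer in the wrong state. A condensed sketch of the fixed setup (buffer contents and sink filters elided):

TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(
    new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
tee1.reset();                          // before anything consumes the tee
TokenStream source1 = new CachingTokenFilter(tee1);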

@ -20,14 +20,14 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
String test = "The quick red fox jumped over the lazy brown dogs";
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
int count = 0;

@ -20,9 +20,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@ -32,7 +32,7 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
String test = "The quick red fox jumped over the lazy brown dogs";
TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
boolean seenDogs = false;

@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -36,7 +36,7 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new SwedishLightStemFilter(source));
}
};

@ -25,6 +25,7 @@ import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -43,14 +44,14 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}

@ -20,8 +20,8 @@ package org.apache.lucene.analysis.tr;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test the Turkish lowercase filter.
@ -32,8 +32,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* Test composed forms
*/
public void testTurkishLowerCaseFilter() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0130STANBUL \u0130ZM\u0130R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});
@ -43,8 +43,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* Test decomposed forms
*/
public void testDecomposed() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});
@ -56,8 +56,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* to U+0130 + U+0316, and is lowercased the same way.
*/
public void testDecomposed2() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
"\u0131\u0316sparta",});