mirror of https://github.com/apache/lucene.git
LUCENE-3113: fix analyzer bugs found by MockTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1104519 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 85f38eb661
commit f1a1844fb9
@@ -75,6 +75,12 @@ Bug Fixes
   caused a problem if you consumed a tokenstream, then reused it, added different
   attributes to it, and consumed it again. (Robert Muir, Uwe Schindler)
 
+* LUCENE-3113: Fixed some minor analysis bugs: double-reset() in ReusableAnalyzerBase
+  and ShingleAnalyzerWrapper, missing end() implementations in PrefixAwareTokenFilter
+  and PrefixAndSuffixAwareTokenFilter, invocations of incrementToken() after it
+  already returned false in CommonGramsQueryFilter, HyphenatedWordsFilter,
+  ShingleFilter, and SynonymsFilter. (Robert Muir, Steven Rowe, Uwe Schindler)
+
 New Features
 
 * LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
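Note on the TokenFilter fixes below: once a wrapped stream's incrementToken() has
returned false, the TokenStream contract forbids calling it again, so each fixed
filter records that fact in an "exhausted" flag and short-circuits later calls.
A minimal illustrative sketch of the guard (the class name is hypothetical, not
code from this commit):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Sketch only: remember when the wrapped stream is consumed and never call
    // input.incrementToken() again until the stream is reset for reuse.
    public final class ExhaustedGuardFilter extends TokenFilter {
      private boolean exhausted;

      public ExhaustedGuardFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!exhausted && input.incrementToken()) {
          return true;        // a real filter would transform the token here
        }
        exhausted = true;     // wrapped stream is done; do not ask it again
        return false;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        exhausted = false;    // a reused stream may be consumed again
      }
    }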
@@ -100,7 +100,7 @@ public class MockTokenizer extends Tokenizer {
         endOffset = off;
         cp = readCodePoint();
       } while (cp >= 0 && isTokenChar(cp));
-      offsetAtt.setOffset(startOffset, endOffset);
+      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
       streamState = State.INCREMENT;
       return true;
     }
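The MockTokenizer hunk above routes offsets through correctOffset() so that a
CharFilter placed in front of the tokenizer can map them back to positions in
the original input. An illustrative sketch of where that call belongs in a
Tokenizer (class, field, and helper names are hypothetical, not code from this
commit):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    // Sketch only: emits the whole input as one token and reports its offsets
    // through correctOffset(), which is the identity for a plain Reader but maps
    // positions back through any wrapping CharFilter.
    public final class WholeInputTokenizer extends Tokenizer {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
      private boolean done;

      public WholeInputTokenizer(Reader input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (done) {
          return false;
        }
        done = true;
        clearAttributes();
        char[] buffer = new char[256];
        int length = input.read(buffer);
        if (length <= 0) {
          return false;
        }
        termAtt.copyBuffer(buffer, 0, length);
        offsetAtt.setOffset(correctOffset(0), correctOffset(length));
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        done = false;
      }
    }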
@@ -49,6 +49,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
 
   private State previous;
   private String previousType;
+  private boolean exhausted;
 
   /**
    * Constructs a new CommonGramsQueryFilter based on the provided CommomGramsFilter
@@ -67,6 +68,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
     super.reset();
     previous = null;
     previousType = null;
+    exhausted = false;
   }
 
   /**
@@ -79,7 +81,7 @@ public final class CommonGramsQueryFilter extends TokenFilter {
    */
   @Override
   public boolean incrementToken() throws IOException {
-    while (input.incrementToken()) {
+    while (!exhausted && input.incrementToken()) {
       State current = captureState();
 
       if (previous != null && !isGramType()) {
@@ -96,6 +98,8 @@ public final class CommonGramsQueryFilter extends TokenFilter {
       previous = current;
     }
 
+    exhausted = true;
+
     if (previous == null || GRAM_TYPE.equals(previousType)) {
       return false;
     }
@@ -59,6 +59,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
 
   private final StringBuilder hyphenated = new StringBuilder();
   private State savedState;
+  private boolean exhausted = false;
 
   /**
    * Creates a new HyphenatedWordsFilter
@@ -74,7 +75,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
    */
   @Override
   public boolean incrementToken() throws IOException {
-    while (input.incrementToken()) {
+    while (!exhausted && input.incrementToken()) {
       char[] term = termAttribute.buffer();
       int termLength = termAttribute.length();
 
@@ -96,6 +97,8 @@ public final class HyphenatedWordsFilter extends TokenFilter {
       }
     }
 
+    exhausted = true;
+
     if (savedState != null) {
       // the final term ends with a hyphen
       // add back the hyphen, for backwards compatibility.
@@ -115,6 +118,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
     super.reset();
     hyphenated.setLength(0);
     savedState = null;
+    exhausted = false;
   }
 
   // ================================================= Helper Methods ================================================
@@ -76,4 +76,9 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
   public void close() throws IOException {
     suffix.close();
   }
+
+  @Override
+  public void end() throws IOException {
+    suffix.end();
+  }
 }
@@ -158,6 +158,12 @@ public class PrefixAwareTokenFilter extends TokenStream {
     return suffixToken;
   }
 
+  @Override
+  public void end() throws IOException {
+    prefix.end();
+    suffix.end();
+  }
+
   @Override
   public void close() throws IOException {
     prefix.close();
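The two hunks above supply the missing end() overrides: end() runs once after
incrementToken() has returned false, and a stream that wraps other streams must
forward it (and close()) so their final state reaches the consumer. A small
illustrative sketch of that convention (hypothetical class name, not this
commit's code):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;

    // Sketch only: forward the lifecycle calls to both wrapped streams.
    public abstract class JoinedTokenStream extends TokenStream {
      private final TokenStream first;
      private final TokenStream second;

      protected JoinedTokenStream(TokenStream first, TokenStream second) {
        this.first = first;
        this.second = second;
      }

      @Override
      public void end() throws IOException {
        // called once after the last incrementToken(); both inputs must see it
        first.end();
        second.end();
      }

      @Override
      public void close() throws IOException {
        first.close();
        second.close();
      }

      @Override
      public void reset() throws IOException {
        first.reset();
        second.reset();
      }
    }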
@@ -225,7 +225,6 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
       TokenStream result = delegate.reusableTokenStream(fieldName, reader);
       if (result == streams.wrapped) {
         /* the wrapped analyzer reused the stream */
-        streams.withStopFilter.reset();
       } else {
         /*
          * the wrapped analyzer did not. if there are any stopwords for the
@@ -199,10 +199,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
       setPreviousTokenStream(streams);
     } else {
       TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader);
-      if (result == streams.wrapped) {
-        /* the wrapped analyzer reused the stream */
-        streams.shingle.reset();
-      } else {
+      if (result != streams.wrapped) {
         /* the wrapped analyzer did not, create a new shingle around the new one */
         streams.wrapped = result;
         streams.shingle = new ShingleFilter(streams.wrapped);
@@ -327,6 +327,8 @@ public final class ShingleFilter extends TokenFilter {
     return tokenAvailable;
   }
 
+  private boolean exhausted;
+
   /**
    * <p>Get the next token from the input stream.
    * <p>If the next token has <code>positionIncrement > 1</code>,
@@ -359,7 +361,7 @@ public final class ShingleFilter extends TokenFilter {
         }
         isNextInputStreamToken = false;
         newTarget.isFiller = false;
-      } else if (input.incrementToken()) {
+      } else if (!exhausted && input.incrementToken()) {
         if (null == target) {
           newTarget = new InputWindowToken(cloneAttributes());
         } else {
@@ -387,6 +389,7 @@ public final class ShingleFilter extends TokenFilter {
       }
     } else {
       newTarget = null;
+      exhausted = true;
     }
     return newTarget;
   }
@@ -435,7 +438,8 @@ public final class ShingleFilter extends TokenFilter {
     inputWindow.clear();
     numFillerTokensToInsert = 0;
     isOutputHere = false;
     noShingleOutput = true;
+    exhausted = false;
     if (outputUnigramsIfNoShingles && ! outputUnigrams) {
       // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
       gramSize.minValue = minShingleSize;
@@ -190,14 +190,18 @@ public final class SynonymFilter extends TokenFilter {
   private LinkedList<AttributeSource> buffer;
   private LinkedList<AttributeSource> matched;
 
+  private boolean exhausted;
+
   private AttributeSource nextTok() throws IOException {
     if (buffer!=null && !buffer.isEmpty()) {
       return buffer.removeFirst();
     } else {
-      if (input.incrementToken()) {
+      if (!exhausted && input.incrementToken()) {
         return this;
-      } else
+      } else {
+        exhausted = true;
         return null;
+      }
     }
   }
 
@@ -250,5 +254,6 @@ public final class SynonymFilter extends TokenFilter {
   public void reset() throws IOException {
     input.reset();
     replacement = null;
+    exhausted = false;
   }
 }
@@ -159,8 +159,6 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
    */
   protected boolean reset(final Reader reader) throws IOException {
     source.reset(reader);
-    if(sink != source)
-      sink.reset(); // only reset if the sink reference is different from source
     return true;
   }
 
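The removed sink.reset() addressed the double-reset part of this issue: the
consumer of an analyzer already calls reset() on the stream it receives, so
resetting the wrapped filter chain inside the analyzer as well made filters see
reset() twice per reuse. An illustrative sketch of the consumer-side cycle this
relies on (class and method names are examples, not code from this commit):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public final class ConsumeExample {
      // Standard consume cycle: reset once, iterate, end, close.
      static void consume(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.reusableTokenStream("field", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString());
        }
        ts.end();
        ts.close();
      }
    }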
@@ -21,7 +21,7 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
@@ -215,8 +215,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
   public void testWithKeywordAttribute() throws IOException {
     CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
     set.add("строеве");
-    WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader("строевете строеве"));
+    MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);
 
     BulgarianStemFilter filter = new BulgarianStemFilter(
         new KeywordMarkerFilter(tokenStream, set));
@@ -22,8 +22,8 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
@@ -64,55 +64,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
   public void testNothingChange() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
-    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to2() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to3() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
   }
 
   public void test2to4() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
   }
 
   public void test2to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
   }
 
   public void test3to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
   }
 
   public void test4to2() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
   }
 
   public void test5to0() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts, new String[0]);
   }
 
@@ -136,7 +136,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   //
   public void testTokenStream() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"i","i","jj","kkk","llll","cc","b","a"},
       new int[]{0,2,4,6,8,11,16,20},
@@ -157,7 +157,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   public void testChained() throws Exception {
     CharStream cs = new MappingCharFilter( normMap,
         new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
-    TokenStream ts = new WhitespaceTokenizer( TEST_VERSION_CURRENT, cs );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"a","llllllll","i"},
       new int[]{0,5,8},
@@ -21,6 +21,7 @@ import java.io.StringReader;
 import java.util.Arrays;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@@ -90,7 +91,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     @Override
     public TokenStream tokenStream(String field, Reader in) {
       return new CommonGramsQueryFilter(new CommonGramsFilter(TEST_VERSION_CURRENT,
-          new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
+          new MockTokenizer(in, MockTokenizer.WHITESPACE, false), commonWords));
     }
   };
 
@@ -159,7 +160,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     @Override
     public TokenStream tokenStream(String field, Reader in) {
       return new CommonGramsFilter(TEST_VERSION_CURRENT,
-          new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
+          new MockTokenizer(in, MockTokenizer.WHITESPACE, false), commonWords);
     }
   };
 
@@ -245,7 +246,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void testCaseSensitive() throws Exception {
     final String input = "How The s a brown s cow d like A B thing?";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     TokenFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
         "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
@@ -257,7 +258,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void testLastWordisStopWord() throws Exception {
     final String input = "dog the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -268,7 +269,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void testFirstWordisStopWord() throws Exception {
     final String input = "the dog";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -279,7 +280,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void testOneWordQueryStopWord() throws Exception {
     final String input = "the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the" });
@@ -290,7 +291,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void testOneWordQuery() throws Exception {
     final String input = "monster";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -301,7 +302,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    */
   public void TestFirstAndLastStopWord() throws Exception {
     final String input = "the of";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    MockTokenizer wt = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_of" });
@@ -21,6 +21,7 @@ import java.io.StringReader;
 import org.xml.sax.InputSource;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@@ -35,8 +36,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         .getHyphenationTree(is);
 
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-            "min veninde som er lidt af en læsehest")), hyphenator,
+        new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
+        hyphenator,
         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
@@ -55,8 +56,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
     // the word basket will not be added due to the longest match option
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-            "basketballkurv")), hyphenator, dict,
+        new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
+        hyphenator, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
     assertTokenStreamContents(tf,
@@ -77,7 +78,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
         hyphenator,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         2, 4);
@@ -89,7 +90,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
     tf = new HyphenationCompoundWordTokenFilter(
         TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
         hyphenator,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         4, 6);
@@ -101,7 +102,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
     tf = new HyphenationCompoundWordTokenFilter(
         TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
         hyphenator,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         4, 10);
@@ -120,9 +121,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         "Sko", "Vind", "Rute", "Torkare", "Blad" };
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new MockTokenizer(
             new StringReader(
-                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
+                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
+            MockTokenizer.WHITESPACE, false),
         dict);
 
     assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
@@ -149,7 +151,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
+        new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.Set;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -36,36 +37,23 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
   public void testExactCase() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     Set<String> stopWords = asSet("is", "the", "Time");
-    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopWords, false);
-    final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
-    assertTrue(stream.incrementToken());
-    assertEquals("Now", termAtt.toString());
-    assertTrue(stream.incrementToken());
-    assertEquals("The", termAtt.toString());
-    assertFalse(stream.incrementToken());
+    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false);
+    assertTokenStreamContents(stream, new String[] { "Now", "The" });
   }
 
   public void testIgnoreCase() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     Set<String> stopWords = asSet( "is", "the", "Time" );
-    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopWords, true);
-    final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
-    assertTrue(stream.incrementToken());
-    assertEquals("Now", termAtt.toString());
-    assertFalse(stream.incrementToken());
+    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
+    assertTokenStreamContents(stream, new String[] { "Now" });
   }
 
   public void testStopFilt() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     String[] stopWords = new String[] { "is", "the", "Time" };
     Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
-    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
-    final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
-    assertTrue(stream.incrementToken());
-    assertEquals("Now", termAtt.toString());
-    assertTrue(stream.incrementToken());
-    assertEquals("The", termAtt.toString());
-    assertFalse(stream.incrementToken());
+    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
+    assertTokenStreamContents(stream, new String[] { "Now", "The" });
   }
 
   /**
@@ -85,11 +73,11 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
     Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
     // with increments
     StringReader reader = new StringReader(sb.toString());
-    StopFilter stpf = new StopFilter(Version.LUCENE_40, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
+    StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
     doTestStopPositons(stpf,true);
     // without increments
     reader = new StringReader(sb.toString());
-    stpf = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
+    stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
     doTestStopPositons(stpf,false);
     // with increments, concatenating two stop filters
     ArrayList<String> a0 = new ArrayList<String>();
@@ -108,7 +96,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
     Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
     Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
     reader = new StringReader(sb.toString());
-    StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), stopSet0); // first part of the set
+    StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
     stpf0.setEnablePositionIncrements(true);
     StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
     doTestStopPositons(stpf01,true);
@@ -119,6 +107,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
     stpf.setEnablePositionIncrements(enableIcrements);
     CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
+    stpf.reset();
     for (int i=0; i<20; i+=3) {
       assertTrue(stpf.incrementToken());
       log("Token "+i+": "+stpf);
@@ -127,6 +116,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
     }
     assertFalse(stpf.incrementToken());
+    stpf.end();
+    stpf.close();
   }
 
   // print debug info depending on VERBOSE
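The TestStopFilter rewrite above is the typical conversion in this commit:
WhitespaceTokenizer is replaced by MockTokenizer, which enforces the TokenStream
consumer contract (reset before use, no incrementToken() after it returns false,
then end() and close()), and assertTokenStreamContents drives that whole
lifecycle. A representative sketch of the converted test shape (names and values
are examples only, not an exact test from this commit):

    import java.io.StringReader;
    import org.apache.lucene.analysis.BaseTokenStreamTestCase;
    import org.apache.lucene.analysis.MockTokenizer;
    import org.apache.lucene.analysis.TokenStream;

    public class ExampleFilterTest extends BaseTokenStreamTestCase {
      public void testContract() throws Exception {
        // MockTokenizer fails the test if the code under test violates the
        // TokenStream workflow, e.g. by calling incrementToken() after false.
        TokenStream ts = new MockTokenizer(new StringReader("now is the time"),
                                           MockTokenizer.WHITESPACE, false);
        assertTokenStreamContents(ts, new String[] { "now", "is", "the", "time" });
      }
    }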
@@ -21,7 +21,7 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -278,7 +278,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("hole");
     CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerFilter(
-        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hole desek")), set));
+        new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
     assertTokenStreamContents(filter, new String[] { "hole", "desk" });
   }
 
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new GermanLightStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new GermanMinimalStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 /**
@@ -34,7 +34,7 @@ public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source));
     }
   };
@@ -22,12 +22,11 @@ import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 
@@ -41,7 +40,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer t = new KeywordTokenizer(reader);
+      Tokenizer t = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
       return new TokenStreamComponents(t, new PorterStemFilter(t));
     }
   };
@@ -57,7 +56,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
   public void testWithKeywordAttribute() throws IOException {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("yourselves");
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("yourselves yours"));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
     TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
     assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
   }
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new SpanishLightStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new FinnishLightStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new FrenchLightStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source));
     }
   };
@@ -21,9 +21,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
  * Test HindiNormalizer
@@ -59,8 +59,7 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
     check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
   }
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader(input));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     TokenFilter tf = new HindiNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
@@ -21,9 +21,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
  * Test HindiStemmer
@@ -81,8 +81,7 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
   }
 
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader(input));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     TokenFilter tf = new HindiStemFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
     }
   };
@@ -21,9 +21,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
  * Test IndicNormalizer
@@ -44,8 +44,7 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
   }
 
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader(input));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);;
     TokenFilter tf = new IndicNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
   }
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new ItalianLightStemFilter(source));
     }
   };
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 /**
@@ -33,7 +33,7 @@ public class TestLatvianStemmer extends BaseTokenStreamTestCase {
   private Analyzer a = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
     }
   };
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
  */
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -30,14 +31,14 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
 
   // testLain1Accents() is a copy of TestLatin1AccentFilter.testU().
   public void testLatin1Accents() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader
+    TokenStream stream = new MockTokenizer(new StringReader
       ("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ"
       +" Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij"
-      +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
+      +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"), MockTokenizer.WHITESPACE, false);
     ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
 
     CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
 
+    filter.reset();
     assertTermEquals("Des", filter, termAtt);
     assertTermEquals("mot", filter, termAtt);
     assertTermEquals("cles", filter, termAtt);
@@ -1891,10 +1892,11 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
       expectedOutputTokens.add(expected.toString());
     }
 
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(inputText.toString()));
+    TokenStream stream = new MockTokenizer(new StringReader(inputText.toString()), MockTokenizer.WHITESPACE, false);
     ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
     CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     Iterator<String> expectedIter = expectedOutputTokens.iterator();
+    filter.reset();
     while (expectedIter.hasNext()) {
       assertTermEquals(expectedIter.next(), filter, termAtt);
     }
@@ -25,9 +25,8 @@ import java.util.Collection;
 import java.util.List;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
@@ -105,7 +104,7 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
       boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
       Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
       int maxTokenLength) throws IOException {
-    assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
+    assertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
         expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
         maxWordCount, maxTokenLength);
   }
@@ -114,7 +113,7 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
       boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
       Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
       int maxTokenLength) throws IOException {
-    assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)),
+    assertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false),
         new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
         minWordLength, maxWordCount, maxTokenLength);
   }
@@ -20,8 +20,8 @@ package org.apache.lucene.analysis.miscellaneous;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
  * HyphenatedWordsFilter test
@@ -30,7 +30,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     ts = new HyphenatedWordsFilter(ts);
     assertTokenStreamContents(ts,
         new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
@@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
   public void testHyphenAtEnd() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     ts = new HyphenatedWordsFilter(ts);
     assertTokenStreamContents(ts,
         new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
@@ -22,8 +22,8 @@ import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 /** Test {@link KeepWordFilter} */
@@ -38,22 +38,22 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
     String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
 
     // Test Stopwords
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
 
     // Now force case
-    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
     assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
 
     // Test Stopwords
-    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
 
     // Now force case
-    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
     assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
   }
@@ -8,9 +8,9 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -45,17 +45,17 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
     String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
         "jumps" };
     assertTokenStreamContents(new LowerCaseFilterMock(
-        new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-            "The quIck browN LuceneFox Jumps")), set)), output);
+        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
     Set<String> jdkSet = new HashSet<String>();
     jdkSet.add("LuceneFox");
     assertTokenStreamContents(new LowerCaseFilterMock(
-        new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-            "The quIck browN LuceneFox Jumps")), jdkSet)), output);
+        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output);
     Set<?> set2 = set;
     assertTokenStreamContents(new LowerCaseFilterMock(
-        new KeywordMarkerFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-            "The quIck browN LuceneFox Jumps")), set2)), output);
+        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
   }
 
   // LUCENE-2901
@@ -63,8 +63,7 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
     TokenStream ts = new LowerCaseFilterMock(
         new KeywordMarkerFilter(
             new KeywordMarkerFilter(
-                new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-                    new StringReader("Dogs Trees Birds Houses")),
+                new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
                 new HashSet<String>(Arrays.asList(new String[] { "Birds", "Houses" }))),
             new HashSet<String>(Arrays.asList(new String[] { "Dogs", "Trees" }))));
 
@@ -18,15 +18,13 @@ package org.apache.lucene.analysis.miscellaneous;
  */
 
 import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import java.io.StringReader;
 
 public class TestLengthFilter extends BaseTokenStreamTestCase {
 
   public void testFilterNoPosIncr() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+    TokenStream stream = new MockTokenizer(
+        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
     LengthFilter filter = new LengthFilter(false, stream, 2, 6);
     assertTokenStreamContents(filter,
       new String[]{"short", "ab", "foo"},
@@ -35,8 +33,8 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
   }
 
   public void testFilterWithPosIncr() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+    TokenStream stream = new MockTokenizer(
+        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
     LengthFilter filter = new LengthFilter(true, stream, 2, 6);
     assertTokenStreamContents(filter,
       new String[]{"short", "ab", "foo"},
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;

@@ -30,7 +30,7 @@ public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
new SingleTokenTokenStream(createToken("^", 0, 0)),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")),
new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false),
new SingleTokenTokenStream(createToken("$", 0, 0)));

assertTokenStreamContents(ts,
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;

@@ -41,7 +41,7 @@ public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase {
// prefix and suffix using 2x prefix

ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")));
new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));

assertTokenStreamContents(ts,
@@ -19,12 +19,11 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

@@ -127,8 +126,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}

public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 0, 1, 0, 1, 1, null);
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 0, 1, 0, 1, 1, null);

assertTokenStreamContents(wdf, output);
}

@@ -169,8 +168,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}

public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
new StringReader(input), MockTokenizer.KEYWORD, false), 1,1,0,0,0,1,0,1,stemPossessive, null);

assertTokenStreamContents(wdf, output);
}

@@ -216,7 +215,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};

@@ -244,7 +243,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new LargePosIncTokenFilter(
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};

@@ -276,7 +275,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String field, Reader reader) {
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), StandardAnalyzer.STOP_WORDS_SET);
filter.setEnablePositionIncrements(true);
return new WordDelimiterFilter(filter,
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

@@ -32,7 +33,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}

public void testInvalidInput() throws Exception {

@@ -91,7 +92,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}

public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

@@ -32,7 +33,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}

public void testInvalidInput() throws Exception {

@@ -80,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}

public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
@@ -24,8 +24,8 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

/**
* Tests {@link PatternReplaceCharFilter}

@@ -39,7 +39,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "this is test.";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },

@@ -52,8 +52,8 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertFalse(ts.incrementToken());
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[] {});
}

// 012345678

@@ -63,7 +63,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },

@@ -78,7 +78,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },

@@ -92,7 +92,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " a a";
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },

@@ -107,7 +107,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },

@@ -122,7 +122,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },

@@ -137,7 +137,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },

@@ -154,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
@@ -18,8 +18,8 @@
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import java.io.StringReader;
import java.util.regex.Pattern;

@@ -32,7 +32,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
"-", true);
assertTokenStreamContents(ts,

@@ -42,7 +42,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
"-", false);
assertTokenStreamContents(ts,

@@ -52,7 +52,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testStripFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
null, false);
assertTokenStreamContents(ts,

@@ -62,7 +62,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testStripAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("a*b"),
null, true);
assertTokenStreamContents(ts,

@@ -72,7 +72,7 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAllWithBackRef() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false),
Pattern.compile("(a*)b"),
"$1\\$", true);
assertTokenStreamContents(ts,
@@ -16,8 +16,8 @@ package org.apache.lucene.analysis.payloads;
* limitations under the License.
*/
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;

@@ -30,7 +30,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testPayloads() throws Exception {
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);

@@ -51,7 +51,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
assertTermEquals("The", filter, null);
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));

@@ -69,7 +69,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testFloatEncoding() throws Exception {
String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new FloatEncoder());
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new FloatEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);

@@ -87,7 +87,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
public void testIntEncoding() throws Exception {
String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false), '|', new IntegerEncoder());
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);

@@ -106,6 +106,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
Payload payload = payloadAtt.getPayload();

@@ -122,6 +123,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
stream.reset();
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
Payload payload = payAtt.getPayload();
@@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

@@ -32,11 +32,12 @@ public class NumericPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";

NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))), 3, "D");
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)), 3, "D");
boolean seenDogs = false;
CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
nptf.reset();
while (nptf.incrementToken()) {
if (termAtt.toString().equals("dogs")) {
seenDogs = true;
@@ -17,7 +17,7 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;

@@ -30,11 +30,11 @@ public class TokenOffsetPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";

TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
int count = 0;
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);

nptf.reset();
while (nptf.incrementToken()) {
Payload pay = payloadAtt.getPayload();
assertTrue("pay is null and it shouldn't be", pay != null);
@@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads;
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

@@ -32,12 +32,12 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";

TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
int count = 0;
CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);

nptf.reset();
while (nptf.incrementToken()) {
assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
@@ -22,6 +22,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

@@ -50,7 +52,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
public void setUp() throws Exception {
super.setUp();
dir = new RAMDirectory();
appAnalyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
appAnalyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, appAnalyzer));
int numDocs = 200;
for (int i = 0; i < numDocs; i++) {

@@ -159,9 +161,9 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
else
return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
}
}

@@ -175,7 +177,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
}

public void testTokenStream() throws Exception {
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
a.addStopWords(reader, 10);
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
assertTokenStreamContents(ts, new String[] { "this" });
@@ -19,22 +19,22 @@ package org.apache.lucene.analysis.reverse;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestReverseStringFilter extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("Do have a nice day")); // 1-4 length string
TokenStream stream = new MockTokenizer(new StringReader("Do have a nice day"),
MockTokenizer.WHITESPACE, false); // 1-4 length string
ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream);
assertTokenStreamContents(filter, new String[] { "oD", "evah", "a", "ecin", "yad" });
}

public void testFilterWithMark() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Do have a nice day")); // 1-4 length string
TokenStream stream = new MockTokenizer(new StringReader("Do have a nice day"),
MockTokenizer.WHITESPACE, false); // 1-4 length string
ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream, '\u0001');
assertTokenStreamContents(filter,
new String[] { "\u0001oD", "\u0001evah", "\u0001a", "\u0001ecin", "\u0001yad" });
@@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

import static org.apache.lucene.analysis.util.VocabularyAssert.*;

@@ -36,7 +36,7 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new RussianLightStemFilter(source));
}
};
@@ -22,10 +22,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;

@@ -106,7 +105,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"test sentence");
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);

@@ -117,7 +116,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"\"this sentence\"");
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);

@@ -128,7 +127,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"\"test sentence\"");
int[] ranks = new int[] { 1 };
compareRanks(hits, ranks);

@@ -139,7 +138,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
*/
public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
"+test +sentence");
int[] ranks = new int[] { 1, 2 };
compareRanks(hits, ranks);

@@ -149,7 +148,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
* This shows how to construct a phrase query containing shingles.
*/
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
searcher = setUpSearcher(analyzer);

PhraseQuery q = new PhraseQuery();

@@ -161,6 +160,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

ts.reset();
while (ts.incrementToken()) {
j += posIncrAtt.getPositionIncrement();
String termText = termAtt.toString();

@@ -178,7 +178,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
* in the right order and adjacent to each other.
*/
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
searcher = setUpSearcher(analyzer);

BooleanQuery q = new BooleanQuery();

@@ -188,6 +188,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

ts.reset();

while (ts.incrementToken()) {
String termText = termAtt.toString();
q.add(new TermQuery(new Term("content", termText)),

@@ -200,7 +202,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}

public void testReusableTokenStream() throws Exception {
Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
assertAnalyzesToReuse(a, "please divide into shingles",
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },

@@ -222,9 +224,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
else
return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
}
}

@@ -249,7 +251,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 4);
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this", "please divide this sentence",
"divide", "divide this sentence", "divide this sentence into",

@@ -273,7 +275,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 3);
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please", "please divide this",
"divide", "divide this sentence",

@@ -297,7 +299,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNoTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",

@@ -319,7 +321,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testNullTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator(null);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",

@@ -340,7 +342,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("<SEP>");
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",

@@ -362,7 +364,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setOutputUnigrams(false);
analyzer.setOutputUnigramsIfNoShingles(true);
assertAnalyzesToReuse(analyzer, "please",
@@ -22,14 +22,14 @@ import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;

public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {

public void test() throws IOException {
DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
int count = 0;
@@ -84,7 +84,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
Document doc = new Document();
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));

@@ -108,7 +108,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
}

public void testGeneral() throws IOException {
final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TokenStream sink1 = source.newSinkTokenStream();
final TokenStream sink2 = source.newSinkTokenStream(theFilter);

@@ -122,16 +122,17 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
}

public void testMultipleSources() throws Exception {
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);

tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);

final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer2.toString())));
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;
@@ -20,14 +20,14 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.MockTokenizer;

public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {

public void test() throws IOException {
TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
String test = "The quick red fox jumped over the lazy brown dogs";
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);

int count = 0;
@@ -20,9 +20,9 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

@@ -32,7 +32,7 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
String test = "The quick red fox jumped over the lazy brown dogs";

TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);

boolean seenDogs = false;
@@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

import static org.apache.lucene.analysis.util.VocabularyAssert.*;

@@ -36,7 +36,7 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new SwedishLightStemFilter(source));
}
};
@@ -25,6 +25,7 @@ import java.util.Collection;
import java.util.List;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

@@ -43,14 +44,14 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}

static void assertTokenizesTo(SynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
@@ -20,8 +20,8 @@ package org.apache.lucene.analysis.tr;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

/**
* Test the Turkish lowercase filter.

@@ -32,8 +32,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* Test composed forms
*/
public void testTurkishLowerCaseFilter() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0130STANBUL \u0130ZM\u0130R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});

@@ -43,8 +43,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* Test decomposed forms
*/
public void testDecomposed() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});

@@ -56,8 +56,8 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
* to U+0130 + U+0316, and is lowercased the same way.
*/
public void testDecomposed2() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
TokenStream stream = new MockTokenizer(new StringReader(
"\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
"\u0131\u0316sparta",});
|
||||
|
|
Loading…
Reference in New Issue