mirror of https://github.com/apache/lucene.git
LUCENE-5353: ShingleFilter's filler token should be configurable
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1562639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4e9a524eda
commit
9feffa8472
|
@ -119,6 +119,9 @@ New Features
|
|||
encode term metadata, and all dictionary implementations can now plug in any
|
||||
PostingsBaseFormat. (Han Jiang, Mike McCandless)
|
||||
|
||||
* LUCENE-5353: ShingleFilter's filler token should be configurable.
|
||||
(Ahmet Arslan, Simon Willnauer, Steve Rowe)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
|
||||
|
|
|
@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
private final String tokenSeparator;
|
||||
private final boolean outputUnigrams;
|
||||
private final boolean outputUnigramsIfNoShingles;
|
||||
private final String fillerToken;
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
|
@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
|
||||
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
|
||||
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||
true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
* minShingleSize tokens in the input stream)?
|
||||
* Note that if outputUnigrams==true, then unigrams are always output,
|
||||
* regardless of whether any shingles are available.
|
||||
* @param fillerToken filler token to use when positionIncrement is more than 1
|
||||
*/
|
||||
public ShingleAnalyzerWrapper(
|
||||
Analyzer delegate,
|
||||
|
@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
int maxShingleSize,
|
||||
String tokenSeparator,
|
||||
boolean outputUnigrams,
|
||||
boolean outputUnigramsIfNoShingles) {
|
||||
boolean outputUnigramsIfNoShingles,
|
||||
String fillerToken) {
|
||||
super(delegate.getReuseStrategy());
|
||||
this.delegate = delegate;
|
||||
|
||||
|
@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
this.fillerToken = fillerToken;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
return outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
public String getFillerToken() {
|
||||
return fillerToken;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Analyzer getWrappedAnalyzer(String fieldName) {
|
||||
return delegate;
|
||||
|
@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
|||
filter.setTokenSeparator(tokenSeparator);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
filter.setFillerToken(fillerToken);
|
||||
return new TokenStreamComponents(components.getTokenizer(), filter);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,7 +47,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
/**
|
||||
* filler token for when positionIncrement is more than 1
|
||||
*/
|
||||
public static final char[] FILLER_TOKEN = { '_' };
|
||||
public static final String DEFAULT_FILLER_TOKEN = "_";
|
||||
|
||||
/**
|
||||
* default maximum shingle size is 2.
|
||||
|
@ -67,7 +67,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
/**
|
||||
* The default string to use when joining adjacent tokens to form a shingle
|
||||
*/
|
||||
public static final String TOKEN_SEPARATOR = " ";
|
||||
public static final String DEFAULT_TOKEN_SEPARATOR = " ";
|
||||
|
||||
/**
|
||||
* The sequence of input stream tokens (or filler tokens, if necessary)
|
||||
|
@ -95,7 +95,13 @@ public final class ShingleFilter extends TokenFilter {
|
|||
/**
|
||||
* The string to use when joining adjacent tokens to form a shingle
|
||||
*/
|
||||
private String tokenSeparator = TOKEN_SEPARATOR;
|
||||
private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
|
||||
|
||||
/**
|
||||
* The string to insert for each position at which there is no token
|
||||
* (i.e., when position increment is greater than one).
|
||||
*/
|
||||
private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
|
||||
|
||||
/**
|
||||
* By default, we output unigrams (individual tokens) as well as shingles
|
||||
|
@ -284,6 +290,16 @@ public final class ShingleFilter extends TokenFilter {
|
|||
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string to insert for each position at which there is no token
|
||||
* (i.e., when position increment is greater than one).
|
||||
*
|
||||
* @param fillerToken string to insert at each position where there is no token
|
||||
*/
|
||||
public void setFillerToken(String fillerToken) {
|
||||
this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean tokenAvailable = false;
|
||||
|
@ -341,7 +357,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
/**
|
||||
* <p>Get the next token from the input stream.
|
||||
* <p>If the next token has <code>positionIncrement > 1</code>,
|
||||
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
|
||||
* <code>positionIncrement - 1</code> {@link #fillerToken}s are
|
||||
* inserted first.
|
||||
* @param target Where to put the new token; if null, a new instance is created.
|
||||
* @return On success, the populated token; null otherwise
|
||||
|
@ -359,7 +375,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
|
||||
newTarget.offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else if (isNextInputStreamToken) {
|
||||
|
@ -390,7 +406,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
isNextInputStreamToken = true;
|
||||
// A filler token occupies no space
|
||||
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
|
||||
newTarget.isFiller = true;
|
||||
--numFillerTokensToInsert;
|
||||
} else {
|
||||
|
|
|
@ -29,7 +29,7 @@ import java.util.Map;
|
|||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
|
||||
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
|
||||
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
|
@ -39,6 +39,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
|||
private final boolean outputUnigrams;
|
||||
private final boolean outputUnigramsIfNoShingles;
|
||||
private final String tokenSeparator;
|
||||
private final String fillerToken;
|
||||
|
||||
/** Creates a new ShingleFilterFactory */
|
||||
public ShingleFilterFactory(Map<String, String> args) {
|
||||
|
@ -57,7 +58,8 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
|||
}
|
||||
outputUnigrams = getBoolean(args, "outputUnigrams", true);
|
||||
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
|
||||
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
|
||||
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -69,6 +71,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
|||
r.setOutputUnigrams(outputUnigrams);
|
||||
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
r.setTokenSeparator(tokenSeparator);
|
||||
r.setFillerToken(fillerToken);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
|
@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
|
||||
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this", "please divide this sentence",
|
||||
"divide this sentence", "divide this sentence into",
|
||||
|
@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
|
||||
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this",
|
||||
"divide this sentence",
|
||||
|
@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", true, false);
|
||||
"", true, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
|
@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, false);
|
||||
"", false, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
|
@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
null, true, false);
|
||||
null, true, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
|
@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, false);
|
||||
"", false, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
|
@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testAltTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"<SEP>", true, false);
|
||||
"<SEP>", true, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "please<SEP>divide",
|
||||
"divide", "divide<SEP>into",
|
||||
|
@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"<SEP>", false, false);
|
||||
"<SEP>", false, false,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please<SEP>divide",
|
||||
"divide<SEP>into",
|
||||
|
@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 13, 18, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
|
||||
public void testAltFillerToken() throws Exception {
|
||||
Analyzer delegate = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
};
|
||||
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
delegate,
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||
true, false, "--");
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "please divide",
|
||||
"divide", "divide --",
|
||||
"-- shingles", "shingles" },
|
||||
new int[] { 0, 0, 7, 7, 19, 19 },
|
||||
new int[] { 6, 13, 13, 19, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 1 });
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
delegate,
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||
false, false, null);
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please divide", "divide ", " shingles" },
|
||||
new int[] { 0, 7, 19 },
|
||||
new int[] { 13, 19, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
delegate,
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||
false, false, "");
|
||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||
new String[] { "please divide", "divide ", " shingles" },
|
||||
new int[] { 0, 7, 19 },
|
||||
new int[] { 13, 19, 27 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, true);
|
||||
"", false, true,
|
||||
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
assertAnalyzesTo(analyzer, "please",
|
||||
new String[] { "please" },
|
||||
new int[] { 0 },
|
||||
|
|
|
@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
new int[] {1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
}
|
||||
|
||||
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
|
||||
// Analyzing "purple wizard of the", where of and the are removed as a
|
||||
// stopwords, leaving two trailing holes:
|
||||
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
|
||||
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||
filter.setFillerToken("--");
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
|
||||
new int[]{0, 0, 0, 7, 7, 7},
|
||||
new int[]{6, 13, 20, 13, 20, 20},
|
||||
new int[]{1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
|
||||
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||
filter.setFillerToken("");
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
|
||||
new int[]{0, 0, 0, 7, 7, 7},
|
||||
new int[]{6, 13, 20, 13, 20, 20},
|
||||
new int[]{1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
|
||||
|
||||
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||
filter.setFillerToken(null);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
|
||||
new int[] {0, 0, 0, 7, 7, 7},
|
||||
new int[] {6, 13, 20, 13, 20, 20},
|
||||
new int[] {1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
|
||||
|
||||
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||
filter.setFillerToken(null);
|
||||
filter.setTokenSeparator(null);
|
||||
|
||||
assertTokenStreamContents(filter,
|
||||
new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
|
||||
new int[] {0, 0, 0, 7, 7, 7},
|
||||
new int[] {6, 13, 20, 13, 20, 20},
|
||||
new int[] {1, 0, 0, 1, 0, 0},
|
||||
20);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue