LUCENE-5353: ShingleFilter's filler token should be configurable

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1562639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-01-29 23:34:48 +00:00
parent 4e9a524eda
commit 9feffa8472
6 changed files with 164 additions and 20 deletions

View File

@ -119,6 +119,9 @@ New Features
encode term metadata, and all dictionary implementations can now plug in any encode term metadata, and all dictionary implementations can now plug in any
PostingsBaseFormat. (Han Jiang, Mike McCandless) PostingsBaseFormat. (Han Jiang, Mike McCandless)
* LUCENE-5353: ShingleFilter's filler token should be configurable.
(Ahmet Arslan, Simon Willnauer, Steve Rowe)
Build Build
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config; * LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;

View File

@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
private final String tokenSeparator; private final String tokenSeparator;
private final boolean outputUnigrams; private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles; private final boolean outputUnigramsIfNoShingles;
private final String fillerToken;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
} }
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false); this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
} }
/** /**
@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
* minShingleSize tokens in the input stream)? * minShingleSize tokens in the input stream)?
* Note that if outputUnigrams==true, then unigrams are always output, * Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available. * regardless of whether any shingles are available.
* @param fillerToken filler token to use when positionIncrement is more than 1
*/ */
public ShingleAnalyzerWrapper( public ShingleAnalyzerWrapper(
Analyzer delegate, Analyzer delegate,
@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
int maxShingleSize, int maxShingleSize,
String tokenSeparator, String tokenSeparator,
boolean outputUnigrams, boolean outputUnigrams,
boolean outputUnigramsIfNoShingles) { boolean outputUnigramsIfNoShingles,
String fillerToken) {
super(delegate.getReuseStrategy()); super(delegate.getReuseStrategy());
this.delegate = delegate; this.delegate = delegate;
@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
this.outputUnigrams = outputUnigrams; this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
this.fillerToken = fillerToken;
} }
/** /**
@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
return outputUnigramsIfNoShingles; return outputUnigramsIfNoShingles;
} }
/**
 * Returns the filler token held by this wrapper, exactly as it was stored
 * by the constructor. This is the same value that is passed to
 * {@link ShingleFilter#setFillerToken(String)} when the shingle filter is
 * configured.
 *
 * @return the configured filler token
 */
public String getFillerToken() {
  return this.fillerToken;
}
@Override @Override
public final Analyzer getWrappedAnalyzer(String fieldName) { public final Analyzer getWrappedAnalyzer(String fieldName) {
return delegate; return delegate;
@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
filter.setTokenSeparator(tokenSeparator); filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams); filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
filter.setFillerToken(fillerToken);
return new TokenStreamComponents(components.getTokenizer(), filter); return new TokenStreamComponents(components.getTokenizer(), filter);
} }
} }

View File

@ -47,7 +47,7 @@ public final class ShingleFilter extends TokenFilter {
/** /**
* filler token for when positionIncrement is more than 1 * filler token for when positionIncrement is more than 1
*/ */
public static final char[] FILLER_TOKEN = { '_' }; public static final String DEFAULT_FILLER_TOKEN = "_";
/** /**
* default maximum shingle size is 2. * default maximum shingle size is 2.
@ -67,7 +67,7 @@ public final class ShingleFilter extends TokenFilter {
/** /**
* The default string to use when joining adjacent tokens to form a shingle * The default string to use when joining adjacent tokens to form a shingle
*/ */
public static final String TOKEN_SEPARATOR = " "; public static final String DEFAULT_TOKEN_SEPARATOR = " ";
/** /**
* The sequence of input stream tokens (or filler tokens, if necessary) * The sequence of input stream tokens (or filler tokens, if necessary)
@ -95,7 +95,13 @@ public final class ShingleFilter extends TokenFilter {
/** /**
* The string to use when joining adjacent tokens to form a shingle * The string to use when joining adjacent tokens to form a shingle
*/ */
private String tokenSeparator = TOKEN_SEPARATOR; private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
/**
* The string to insert for each position at which there is no token
* (i.e., when position increment is greater than one).
*/
private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
/** /**
* By default, we output unigrams (individual tokens) as well as shingles * By default, we output unigrams (individual tokens) as well as shingles
@ -284,6 +290,16 @@ public final class ShingleFilter extends TokenFilter {
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
} }
/**
 * Configures the text emitted in place of each missing position, i.e. for
 * every position skipped when a token's position increment is greater
 * than one.
 *
 * @param fillerToken text to emit for each empty position; {@code null}
 *        is treated the same as the empty string
 */
public void setFillerToken(String fillerToken) {
  if (fillerToken == null) {
    // No filler requested: keep a zero-length buffer so copies stay valid.
    this.fillerToken = new char[0];
  } else {
    this.fillerToken = fillerToken.toCharArray();
  }
}
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
boolean tokenAvailable = false; boolean tokenAvailable = false;
@ -341,7 +357,7 @@ public final class ShingleFilter extends TokenFilter {
/** /**
* <p>Get the next token from the input stream. * <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>, * <p>If the next token has <code>positionIncrement > 1</code>,
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are * <code>positionIncrement - 1</code> {@link #fillerToken}s are
* inserted first. * inserted first.
* @param target Where to put the new token; if null, a new instance is created. * @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise * @return On success, the populated token; null otherwise
@ -359,7 +375,7 @@ public final class ShingleFilter extends TokenFilter {
// A filler token occupies no space // A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset()); newTarget.offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true; newTarget.isFiller = true;
--numFillerTokensToInsert; --numFillerTokensToInsert;
} else if (isNextInputStreamToken) { } else if (isNextInputStreamToken) {
@ -390,7 +406,7 @@ public final class ShingleFilter extends TokenFilter {
isNextInputStreamToken = true; isNextInputStreamToken = true;
// A filler token occupies no space // A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true; newTarget.isFiller = true;
--numFillerTokensToInsert; --numFillerTokensToInsert;
} else { } else {

View File

@ -29,7 +29,7 @@ import java.util.Map;
* &lt;analyzer&gt; * &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt; * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2" * &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/&gt; * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/&gt;
* &lt;/analyzer&gt; * &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
*/ */
@ -39,6 +39,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
private final boolean outputUnigrams; private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles; private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator; private final String tokenSeparator;
private final String fillerToken;
/** Creates a new ShingleFilterFactory */ /** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map<String, String> args) { public ShingleFilterFactory(Map<String, String> args) {
@ -57,7 +58,8 @@ public class ShingleFilterFactory extends TokenFilterFactory {
} }
outputUnigrams = getBoolean(args, "outputUnigrams", true); outputUnigrams = getBoolean(args, "outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false); outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR); tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
if (!args.isEmpty()) { if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args); throw new IllegalArgumentException("Unknown parameters: " + args);
} }
@ -69,6 +71,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
r.setOutputUnigrams(outputUnigrams); r.setOutputUnigrams(outputUnigrams);
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator); r.setTokenSeparator(tokenSeparator);
r.setFillerToken(fillerToken);
return r; return r;
} }
} }

View File

@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 }); new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper( analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false); new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles", assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence", new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into", "divide this sentence", "divide this sentence into",
@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 }); new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper( analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false); new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles", assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", new String[] { "please divide this",
"divide this sentence", "divide this sentence",
@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", true, false); "", true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide", new String[] { "please", "pleasedivide",
"divide", "divideinto", "divide", "divideinto",
@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false); "", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide", new String[] { "pleasedivide",
"divideinto", "divideinto",
@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
null, true, false); null, true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide", new String[] { "please", "pleasedivide",
"divide", "divideinto", "divide", "divideinto",
@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false); "", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide", new String[] { "pleasedivide",
"divideinto", "divideinto",
@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 }, new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 }); new int[] { 1, 1, 1 });
} }
public void testAltTokenSeparator() throws Exception { public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", true, false); "<SEP>", true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide", new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into", "divide", "divide<SEP>into",
@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", false, false); "<SEP>", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles", assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide", new String[] { "please<SEP>divide",
"divide<SEP>into", "divide<SEP>into",
@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 }, new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 }); new int[] { 1, 1, 1 });
} }
/**
 * Exercises the filler-token constructor argument of
 * {@link ShingleAnalyzerWrapper}. The delegate analyzer stops the word
 * "into", and the expected shingles show the configured filler where that
 * word used to be.
 */
public void testAltFillerToken() throws Exception {
  // Whitespace tokenizer followed by a stop filter whose only stopword is "into".
  Analyzer stoppingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter stopped = new StopFilter(TEST_VERSION_CURRENT, source, stopWords);
      return new TokenStreamComponents(source, stopped);
    }
  };

  final String text = "please divide into shingles";

  // Case 1: unigrams enabled, explicit "--" filler.
  ShingleAnalyzerWrapper dashFilled = new ShingleAnalyzerWrapper(
      stoppingAnalyzer,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
      true, false, "--");
  String[] dashTerms = {
      "please", "please divide",
      "divide", "divide --",
      "-- shingles", "shingles" };
  int[] dashStarts = { 0, 0, 7, 7, 19, 19 };
  int[] dashEnds = { 6, 13, 13, 19, 27, 27 };
  int[] dashPosIncs = { 1, 0, 1, 0, 1, 1 };
  assertAnalyzesTo(dashFilled, text, dashTerms, dashStarts, dashEnds, dashPosIncs);

  // Cases 2 and 3: unigrams disabled; a null filler and an empty filler
  // are expected to produce the same shingles (separator only).
  for (String blankFiller : new String[] { null, "" }) {
    ShingleAnalyzerWrapper blankFilled = new ShingleAnalyzerWrapper(
        stoppingAnalyzer,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false, false, blankFiller);
    assertAnalyzesTo(blankFilled, text,
        new String[] { "please divide", "divide ", " shingles" },
        new int[] { 0, 7, 19 },
        new int[] { 13, 19, 27 },
        new int[] { 1, 1, 1 });
  }
}
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception { public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, true); "", false, true,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please", assertAnalyzesTo(analyzer, "please",
new String[] { "please" }, new String[] { "please" },
new int[] { 0 }, new int[] { 0 },

View File

@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
new int[] {1, 0, 0, 1, 0, 0}, new int[] {1, 0, 0, 1, 0, 0},
20); 20);
} }
/**
 * Checks bi- and tri-shingle output when the input ends with two position
 * holes, for four filler configurations: a custom filler ("--"), an empty
 * filler, a null filler, and a null filler combined with a null separator.
 *
 * <p>Each shingle is the concatenation of its tokens joined by the token
 * separator, with the filler standing in for each hole. A tri-shingle that
 * ends in two holes therefore contains two separators even when the filler
 * itself is empty.
 */
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
  // Analyzing "purple wizard of the", where "of" and "the" are removed as
  // stopwords, leaving two trailing holes:
  Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};

  // NOTE(review): CannedTokenStream(2, 20, ...) presumably reports a final
  // position increment of 2 and a final offset of 20 - confirm against the
  // test framework.
  ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);

  // Custom filler: every hole shows up as "--".
  filter.setFillerToken("--");
  assertTokenStreamContents(filter,
      new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
      new int[]{0, 0, 0, 7, 7, 7},
      new int[]{6, 13, 20, 13, 20, 20},
      new int[]{1, 0, 0, 1, 0, 0},
      20);

  // Empty filler: only the separators remain, so the last tri-shingle is
  // "wizard" + " " + "" + " " + "" and ends in TWO spaces.
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken("");
  assertTokenStreamContents(filter,
      new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  "},
      new int[]{0, 0, 0, 7, 7, 7},
      new int[]{6, 13, 20, 13, 20, 20},
      new int[]{1, 0, 0, 1, 0, 0},
      20);

  // Null filler is expected to behave exactly like the empty filler.
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken(null);
  assertTokenStreamContents(filter,
      new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  "},
      new int[] {0, 0, 0, 7, 7, 7},
      new int[] {6, 13, 20, 13, 20, 20},
      new int[] {1, 0, 0, 1, 0, 0},
      20);

  // Null filler and null separator: holes contribute no characters at all.
  filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
  filter.setFillerToken(null);
  filter.setTokenSeparator(null);
  assertTokenStreamContents(filter,
      new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
      new int[] {0, 0, 0, 7, 7, 7},
      new int[] {6, 13, 20, 13, 20, 20},
      new int[] {1, 0, 0, 1, 0, 0},
      20);
}
} }