mirror of https://github.com/apache/lucene.git
LUCENE-5353: ShingleFilter's filler token should be configurable
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1562639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4e9a524eda
commit
9feffa8472
|
@ -119,6 +119,9 @@ New Features
|
||||||
encode term metadata, and all dictionary implementations can now plug in any
|
encode term metadata, and all dictionary implementations can now plug in any
|
||||||
PostingsBaseFormat. (Han Jiang, Mike McCandless)
|
PostingsBaseFormat. (Han Jiang, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-5353: ShingleFilter's filler token should be configurable.
|
||||||
|
(Ahmet Arslan, Simon Willnauer, Steve Rowe)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
|
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
|
||||||
|
|
|
@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
private final String tokenSeparator;
|
private final String tokenSeparator;
|
||||||
private final boolean outputUnigrams;
|
private final boolean outputUnigrams;
|
||||||
private final boolean outputUnigramsIfNoShingles;
|
private final boolean outputUnigramsIfNoShingles;
|
||||||
|
private final String fillerToken;
|
||||||
|
|
||||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||||
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||||
|
@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
|
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
|
||||||
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
|
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||||
|
true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
* minShingleSize tokens in the input stream)?
|
* minShingleSize tokens in the input stream)?
|
||||||
* Note that if outputUnigrams==true, then unigrams are always output,
|
* Note that if outputUnigrams==true, then unigrams are always output,
|
||||||
* regardless of whether any shingles are available.
|
* regardless of whether any shingles are available.
|
||||||
|
* @param fillerToken filler token to use when positionIncrement is more than 1
|
||||||
*/
|
*/
|
||||||
public ShingleAnalyzerWrapper(
|
public ShingleAnalyzerWrapper(
|
||||||
Analyzer delegate,
|
Analyzer delegate,
|
||||||
|
@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
int maxShingleSize,
|
int maxShingleSize,
|
||||||
String tokenSeparator,
|
String tokenSeparator,
|
||||||
boolean outputUnigrams,
|
boolean outputUnigrams,
|
||||||
boolean outputUnigramsIfNoShingles) {
|
boolean outputUnigramsIfNoShingles,
|
||||||
|
String fillerToken) {
|
||||||
super(delegate.getReuseStrategy());
|
super(delegate.getReuseStrategy());
|
||||||
this.delegate = delegate;
|
this.delegate = delegate;
|
||||||
|
|
||||||
|
@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
||||||
this.outputUnigrams = outputUnigrams;
|
this.outputUnigrams = outputUnigrams;
|
||||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||||
|
this.fillerToken = fillerToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
return outputUnigramsIfNoShingles;
|
return outputUnigramsIfNoShingles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getFillerToken() {
|
||||||
|
return fillerToken;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final Analyzer getWrappedAnalyzer(String fieldName) {
|
public final Analyzer getWrappedAnalyzer(String fieldName) {
|
||||||
return delegate;
|
return delegate;
|
||||||
|
@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
filter.setTokenSeparator(tokenSeparator);
|
filter.setTokenSeparator(tokenSeparator);
|
||||||
filter.setOutputUnigrams(outputUnigrams);
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
|
filter.setFillerToken(fillerToken);
|
||||||
return new TokenStreamComponents(components.getTokenizer(), filter);
|
return new TokenStreamComponents(components.getTokenizer(), filter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,7 +47,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* filler token for when positionIncrement is more than 1
|
* filler token for when positionIncrement is more than 1
|
||||||
*/
|
*/
|
||||||
public static final char[] FILLER_TOKEN = { '_' };
|
public static final String DEFAULT_FILLER_TOKEN = "_";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* default maximum shingle size is 2.
|
* default maximum shingle size is 2.
|
||||||
|
@ -67,7 +67,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* The default string to use when joining adjacent tokens to form a shingle
|
* The default string to use when joining adjacent tokens to form a shingle
|
||||||
*/
|
*/
|
||||||
public static final String TOKEN_SEPARATOR = " ";
|
public static final String DEFAULT_TOKEN_SEPARATOR = " ";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The sequence of input stream tokens (or filler tokens, if necessary)
|
* The sequence of input stream tokens (or filler tokens, if necessary)
|
||||||
|
@ -95,7 +95,13 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* The string to use when joining adjacent tokens to form a shingle
|
* The string to use when joining adjacent tokens to form a shingle
|
||||||
*/
|
*/
|
||||||
private String tokenSeparator = TOKEN_SEPARATOR;
|
private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The string to insert for each position at which there is no token
|
||||||
|
* (i.e., when position increment is greater than one).
|
||||||
|
*/
|
||||||
|
private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default, we output unigrams (individual tokens) as well as shingles
|
* By default, we output unigrams (individual tokens) as well as shingles
|
||||||
|
@ -284,6 +290,16 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
|
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the string to insert for each position at which there is no token
|
||||||
|
* (i.e., when position increment is greater than one).
|
||||||
|
*
|
||||||
|
* @param fillerToken string to insert at each position where there is no token
|
||||||
|
*/
|
||||||
|
public void setFillerToken(String fillerToken) {
|
||||||
|
this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
boolean tokenAvailable = false;
|
boolean tokenAvailable = false;
|
||||||
|
@ -341,7 +357,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* <p>Get the next token from the input stream.
|
* <p>Get the next token from the input stream.
|
||||||
* <p>If the next token has <code>positionIncrement > 1</code>,
|
* <p>If the next token has <code>positionIncrement > 1</code>,
|
||||||
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
|
* <code>positionIncrement - 1</code> {@link #fillerToken}s are
|
||||||
* inserted first.
|
* inserted first.
|
||||||
* @param target Where to put the new token; if null, a new instance is created.
|
* @param target Where to put the new token; if null, a new instance is created.
|
||||||
* @return On success, the populated token; null otherwise
|
* @return On success, the populated token; null otherwise
|
||||||
|
@ -359,7 +375,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
// A filler token occupies no space
|
// A filler token occupies no space
|
||||||
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
|
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
|
||||||
newTarget.offsetAtt.startOffset());
|
newTarget.offsetAtt.startOffset());
|
||||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
|
||||||
newTarget.isFiller = true;
|
newTarget.isFiller = true;
|
||||||
--numFillerTokensToInsert;
|
--numFillerTokensToInsert;
|
||||||
} else if (isNextInputStreamToken) {
|
} else if (isNextInputStreamToken) {
|
||||||
|
@ -390,7 +406,7 @@ public final class ShingleFilter extends TokenFilter {
|
||||||
isNextInputStreamToken = true;
|
isNextInputStreamToken = true;
|
||||||
// A filler token occupies no space
|
// A filler token occupies no space
|
||||||
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
|
||||||
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
|
||||||
newTarget.isFiller = true;
|
newTarget.isFiller = true;
|
||||||
--numFillerTokensToInsert;
|
--numFillerTokensToInsert;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -29,7 +29,7 @@ import java.util.Map;
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
|
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
|
||||||
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
|
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
|
@ -39,6 +39,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
||||||
private final boolean outputUnigrams;
|
private final boolean outputUnigrams;
|
||||||
private final boolean outputUnigramsIfNoShingles;
|
private final boolean outputUnigramsIfNoShingles;
|
||||||
private final String tokenSeparator;
|
private final String tokenSeparator;
|
||||||
|
private final String fillerToken;
|
||||||
|
|
||||||
/** Creates a new ShingleFilterFactory */
|
/** Creates a new ShingleFilterFactory */
|
||||||
public ShingleFilterFactory(Map<String, String> args) {
|
public ShingleFilterFactory(Map<String, String> args) {
|
||||||
|
@ -57,7 +58,8 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
||||||
}
|
}
|
||||||
outputUnigrams = getBoolean(args, "outputUnigrams", true);
|
outputUnigrams = getBoolean(args, "outputUnigrams", true);
|
||||||
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
|
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
|
||||||
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
|
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||||
|
fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -69,6 +71,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
|
||||||
r.setOutputUnigrams(outputUnigrams);
|
r.setOutputUnigrams(outputUnigrams);
|
||||||
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
r.setTokenSeparator(tokenSeparator);
|
r.setTokenSeparator(tokenSeparator);
|
||||||
|
r.setFillerToken(fillerToken);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
|
@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
|
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
|
||||||
|
|
||||||
analyzer = new ShingleAnalyzerWrapper(
|
analyzer = new ShingleAnalyzerWrapper(
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
|
||||||
|
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
||||||
new String[] { "please divide this", "please divide this sentence",
|
new String[] { "please divide this", "please divide this sentence",
|
||||||
"divide this sentence", "divide this sentence into",
|
"divide this sentence", "divide this sentence into",
|
||||||
|
@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
|
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
|
||||||
|
|
||||||
analyzer = new ShingleAnalyzerWrapper(
|
analyzer = new ShingleAnalyzerWrapper(
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
|
||||||
|
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
|
||||||
new String[] { "please divide this",
|
new String[] { "please divide this",
|
||||||
"divide this sentence",
|
"divide this sentence",
|
||||||
|
@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"", true, false);
|
"", true, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "please", "pleasedivide",
|
new String[] { "please", "pleasedivide",
|
||||||
"divide", "divideinto",
|
"divide", "divideinto",
|
||||||
|
@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"", false, false);
|
"", false, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "pleasedivide",
|
new String[] { "pleasedivide",
|
||||||
"divideinto",
|
"divideinto",
|
||||||
|
@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
null, true, false);
|
null, true, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "please", "pleasedivide",
|
new String[] { "please", "pleasedivide",
|
||||||
"divide", "divideinto",
|
"divide", "divideinto",
|
||||||
|
@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"", false, false);
|
"", false, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "pleasedivide",
|
new String[] { "pleasedivide",
|
||||||
"divideinto",
|
"divideinto",
|
||||||
|
@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new int[] { 13, 18, 27 },
|
new int[] { 13, 18, 27 },
|
||||||
new int[] { 1, 1, 1 });
|
new int[] { 1, 1, 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAltTokenSeparator() throws Exception {
|
public void testAltTokenSeparator() throws Exception {
|
||||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"<SEP>", true, false);
|
"<SEP>", true, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "please", "please<SEP>divide",
|
new String[] { "please", "please<SEP>divide",
|
||||||
"divide", "divide<SEP>into",
|
"divide", "divide<SEP>into",
|
||||||
|
@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"<SEP>", false, false);
|
"<SEP>", false, false,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please divide into shingles",
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
new String[] { "please<SEP>divide",
|
new String[] { "please<SEP>divide",
|
||||||
"divide<SEP>into",
|
"divide<SEP>into",
|
||||||
|
@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
||||||
new int[] { 13, 18, 27 },
|
new int[] { 13, 18, 27 },
|
||||||
new int[] { 1, 1, 1 });
|
new int[] { 1, 1, 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testAltFillerToken() throws Exception {
|
||||||
|
Analyzer delegate = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
|
||||||
|
return new TokenStreamComponents(tokenizer, filter);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||||
|
delegate,
|
||||||
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||||
|
true, false, "--");
|
||||||
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
|
new String[] { "please", "please divide",
|
||||||
|
"divide", "divide --",
|
||||||
|
"-- shingles", "shingles" },
|
||||||
|
new int[] { 0, 0, 7, 7, 19, 19 },
|
||||||
|
new int[] { 6, 13, 13, 19, 27, 27 },
|
||||||
|
new int[] { 1, 0, 1, 0, 1, 1 });
|
||||||
|
|
||||||
|
analyzer = new ShingleAnalyzerWrapper(
|
||||||
|
delegate,
|
||||||
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||||
|
false, false, null);
|
||||||
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
|
new String[] { "please divide", "divide ", " shingles" },
|
||||||
|
new int[] { 0, 7, 19 },
|
||||||
|
new int[] { 13, 19, 27 },
|
||||||
|
new int[] { 1, 1, 1 });
|
||||||
|
|
||||||
|
analyzer = new ShingleAnalyzerWrapper(
|
||||||
|
delegate,
|
||||||
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
|
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
|
||||||
|
false, false, "");
|
||||||
|
assertAnalyzesTo(analyzer, "please divide into shingles",
|
||||||
|
new String[] { "please divide", "divide ", " shingles" },
|
||||||
|
new int[] { 0, 7, 19 },
|
||||||
|
new int[] { 13, 19, 27 },
|
||||||
|
new int[] { 1, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
||||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
|
||||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||||
"", false, true);
|
"", false, true,
|
||||||
|
ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||||
assertAnalyzesTo(analyzer, "please",
|
assertAnalyzesTo(analyzer, "please",
|
||||||
new String[] { "please" },
|
new String[] { "please" },
|
||||||
new int[] { 0 },
|
new int[] { 0 },
|
||||||
|
|
|
@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
new int[] {1, 0, 0, 1, 0, 0},
|
new int[] {1, 0, 0, 1, 0, 0},
|
||||||
20);
|
20);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
|
||||||
|
// Analyzing "purple wizard of the", where "of" and "the" are removed as
|
||||||
|
// stopwords, leaving two trailing holes:
|
||||||
|
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
|
||||||
|
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||||
|
filter.setFillerToken("--");
|
||||||
|
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
|
||||||
|
new int[]{0, 0, 0, 7, 7, 7},
|
||||||
|
new int[]{6, 13, 20, 13, 20, 20},
|
||||||
|
new int[]{1, 0, 0, 1, 0, 0},
|
||||||
|
20);
|
||||||
|
|
||||||
|
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||||
|
filter.setFillerToken("");
|
||||||
|
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
|
||||||
|
new int[]{0, 0, 0, 7, 7, 7},
|
||||||
|
new int[]{6, 13, 20, 13, 20, 20},
|
||||||
|
new int[]{1, 0, 0, 1, 0, 0},
|
||||||
|
20);
|
||||||
|
|
||||||
|
|
||||||
|
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||||
|
filter.setFillerToken(null);
|
||||||
|
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
|
||||||
|
new int[] {0, 0, 0, 7, 7, 7},
|
||||||
|
new int[] {6, 13, 20, 13, 20, 20},
|
||||||
|
new int[] {1, 0, 0, 1, 0, 0},
|
||||||
|
20);
|
||||||
|
|
||||||
|
|
||||||
|
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
|
||||||
|
filter.setFillerToken(null);
|
||||||
|
filter.setTokenSeparator(null);
|
||||||
|
|
||||||
|
assertTokenStreamContents(filter,
|
||||||
|
new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
|
||||||
|
new int[] {0, 0, 0, 7, 7, 7},
|
||||||
|
new int[] {6, 13, 20, 13, 20, 20},
|
||||||
|
new int[] {1, 0, 0, 1, 0, 0},
|
||||||
|
20);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue