LUCENE-5353: ShingleFilter's filler token should be configurable

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1562639 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-01-29 23:34:48 +00:00
parent 4e9a524eda
commit 9feffa8472
6 changed files with 164 additions and 20 deletions

View File

@ -119,6 +119,9 @@ New Features
encode term metadata, and all dictionary implementations can now plug in any
PostingsBaseFormat. (Han Jiang, Mike McCandless)
* LUCENE-5353: ShingleFilter's filler token should be configurable.
(Ahmet Arslan, Simon Willnauer, Steve Rowe)
Build
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;

View File

@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
private final String tokenSeparator;
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String fillerToken;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
/**
@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
* minShingleSize tokens in the input stream)?
* Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
* @param fillerToken filler token to use when positionIncrement is more than 1
*/
public ShingleAnalyzerWrapper(
Analyzer delegate,
@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
int maxShingleSize,
String tokenSeparator,
boolean outputUnigrams,
boolean outputUnigramsIfNoShingles) {
boolean outputUnigramsIfNoShingles,
String fillerToken) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
this.fillerToken = fillerToken;
}
/**
@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
return outputUnigramsIfNoShingles;
}
public String getFillerToken() {
return fillerToken;
}
@Override
public final Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
filter.setFillerToken(fillerToken);
return new TokenStreamComponents(components.getTokenizer(), filter);
}
}

View File

@ -47,7 +47,7 @@ public final class ShingleFilter extends TokenFilter {
/**
* filler token for when positionIncrement is more than 1
*/
public static final char[] FILLER_TOKEN = { '_' };
public static final String DEFAULT_FILLER_TOKEN = "_";
/**
* default maximum shingle size is 2.
@ -67,7 +67,7 @@ public final class ShingleFilter extends TokenFilter {
/**
* The default string to use when joining adjacent tokens to form a shingle
*/
public static final String TOKEN_SEPARATOR = " ";
public static final String DEFAULT_TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary)
@ -95,7 +95,13 @@ public final class ShingleFilter extends TokenFilter {
/**
* The string to use when joining adjacent tokens to form a shingle
*/
private String tokenSeparator = TOKEN_SEPARATOR;
private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
/**
* The string to insert for each position at which there is no token
* (i.e., when position increment is greater than one).
*/
private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
/**
* By default, we output unigrams (individual tokens) as well as shingles
@ -284,6 +290,16 @@ public final class ShingleFilter extends TokenFilter {
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
/**
* Sets the string to insert for each position at which there is no token
* (i.e., when position increment is greater than one).
*
* @param fillerToken string to insert at each position where there is no token
*/
public void setFillerToken(String fillerToken) {
this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
}
@Override
public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
@ -341,7 +357,7 @@ public final class ShingleFilter extends TokenFilter {
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
* <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
* <code>positionIncrement - 1</code> {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
@ -359,7 +375,7 @@ public final class ShingleFilter extends TokenFilter {
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
@ -390,7 +406,7 @@ public final class ShingleFilter extends TokenFilter {
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {

View File

@ -29,7 +29,7 @@ import java.util.Map;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/&gt;
* outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@ -39,6 +39,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
private final String fillerToken;
/** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map<String, String> args) {
@ -57,7 +58,8 @@ public class ShingleFilterFactory extends TokenFilterFactory {
}
outputUnigrams = getBoolean(args, "outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -69,6 +71,7 @@ public class ShingleFilterFactory extends TokenFilterFactory {
r.setOutputUnigrams(outputUnigrams);
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator);
r.setFillerToken(fillerToken);
return r;
}
}

View File

@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", true, false);
"", true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
"", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
null, true, false);
null, true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
"", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", true, false);
"<SEP>", true, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into",
@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", false, false);
"<SEP>", false, false,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide",
"divide<SEP>into",
@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
public void testAltFillerToken() throws Exception {
Analyzer delegate = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
return new TokenStreamComponents(tokenizer, filter);
}
};
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
delegate,
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
true, false, "--");
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "please divide",
"divide", "divide --",
"-- shingles", "shingles" },
new int[] { 0, 0, 7, 7, 19, 19 },
new int[] { 6, 13, 13, 19, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
delegate,
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
false, false, null);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please divide", "divide ", " shingles" },
new int[] { 0, 7, 19 },
new int[] { 13, 19, 27 },
new int[] { 1, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
delegate,
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
false, false, "");
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please divide", "divide ", " shingles" },
new int[] { 0, 7, 19 },
new int[] { 13, 19, 27 },
new int[] { 1, 1, 1 });
}
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, true);
"", false, true,
ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please",
new String[] { "please" },
new int[] { 0 },

View File

@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
new int[] {1, 0, 0, 1, 0, 0},
20);
}
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken("--");
assertTokenStreamContents(filter,
new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
new int[]{0, 0, 0, 7, 7, 7},
new int[]{6, 13, 20, 13, 20, 20},
new int[]{1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken("");
assertTokenStreamContents(filter,
new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
new int[]{0, 0, 0, 7, 7, 7},
new int[]{6, 13, 20, 13, 20, 20},
new int[]{1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken(null);
assertTokenStreamContents(filter,
new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken(null);
filter.setTokenSeparator(null);
assertTokenStreamContents(filter,
new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
}
}