SOLR-1740: ShingleFilterFactory improvements

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@930163 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-04-02 05:00:53 +00:00
parent 88fc2e2f56
commit 47416b3948
3 changed files with 179 additions and 1 deletions

View File

@ -135,6 +135,10 @@ New Features
  TokenFilters now support custom Attributes, and some have improved performance:
  especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)

* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
  parameters for controlling the minimum shingle size produced by the filter, and
  the separator string that it uses, respectively. (Steven Rowe via rmuir)

Optimizations
----------------------

View File

@ -21,21 +21,49 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import java.util.Map; import java.util.Map;
/** Factory for {@link ShingleFilter} */ /** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory { public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int minShingleSize;
private int maxShingleSize; private int maxShingleSize;
private boolean outputUnigrams; private boolean outputUnigrams;
private String tokenSeparator;
public void init(Map<String, String> args) { public void init(Map<String, String> args) {
super.init(args); super.init(args);
maxShingleSize = getInt("maxShingleSize", maxShingleSize = getInt("maxShingleSize",
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
if (maxShingleSize < 2) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid maxShingleSize (" + maxShingleSize
+ ") - must be at least 2");
}
minShingleSize = getInt("minShingleSize",
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
if (minShingleSize < 2) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid minShingleSize (" + minShingleSize
+ ") - must be at least 2");
}
if (minShingleSize > maxShingleSize) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid minShingleSize (" + minShingleSize
+ ") - must be no greater than maxShingleSize ("
+ maxShingleSize + ")");
}
outputUnigrams = getBoolean("outputUnigrams", true); outputUnigrams = getBoolean("outputUnigrams", true);
tokenSeparator = args.containsKey("tokenSeparator")
? args.get("tokenSeparator")
: ShingleFilter.TOKEN_SEPARATOR;
} }
public ShingleFilter create(TokenStream input) { public ShingleFilter create(TokenStream input) {
ShingleFilter r = new ShingleFilter(input,maxShingleSize); ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
r.setOutputUnigrams(outputUnigrams); r.setOutputUnigrams(outputUnigrams);
r.setTokenSeparator(tokenSeparator);
return r; return r;
} }
} }

View File

@ -70,4 +70,150 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
new String[] {"this", "this is", "this is a", "is", new String[] {"this", "this is", "this is a", "is",
"is a", "is a test", "a", "a test", "test"}); "is a", "is a test", "a", "a test", "test"});
} }
/**
 * Shingle sizes 3 through 4 with unigram output left unset: expects every
 * single token plus the three- and four-word shingles, and no two-word shingles.
 */
public void testMinShingleSize() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "4");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this", "this is a", "this is a test",
    "is", "is a test", "a", "test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Shingle sizes 3 through 4 with unigrams switched off: expects only the
 * three- and four-word shingles.
 */
public void testMinShingleSizeNoUnigrams() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "4");
  config.put("outputUnigrams", "false");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = { "this is a", "this is a test", "is a test" };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Min and max shingle size both set to 3, unigram output left unset: expects
 * every single token plus the three-word shingles only.
 */
public void testEqualMinAndMaxShingleSize() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "3");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this", "this is a", "is", "is a test", "a", "test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Min and max shingle size both set to 3 with unigrams switched off: expects
 * only the three-word shingles.
 */
public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "3");
  config.put("outputUnigrams", "false");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = { "this is a", "is a test" };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Custom separator "=BLAH=" with shingle sizes left unset: expects every
 * single token plus two-word shingles joined by the custom separator.
 */
public void testTokenSeparator() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("tokenSeparator", "=BLAH=");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this", "this=BLAH=is", "is", "is=BLAH=a",
    "a", "a=BLAH=test", "test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Custom separator "=BLAH=" with unigrams switched off: expects only the
 * two-word shingles joined by the custom separator.
 */
public void testTokenSeparatorNoUnigrams() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("tokenSeparator", "=BLAH=");
  config.put("outputUnigrams", "false");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Empty-string separator: expects every single token plus two-word shingles
 * whose words are concatenated with nothing between them.
 */
public void testEmptyTokenSeparator() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("tokenSeparator", "");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this", "thisis", "is", "isa", "a", "atest", "test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Shingle sizes 3 through 4 combined with the custom separator "=BLAH=",
 * unigram output left unset: expects every single token plus the three- and
 * four-word shingles joined by the custom separator.
 */
public void testMinShingleSizeAndTokenSeparator() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "4");
  config.put("tokenSeparator", "=BLAH=");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this",
    "this=BLAH=is=BLAH=a",
    "this=BLAH=is=BLAH=a=BLAH=test",
    "is",
    "is=BLAH=a=BLAH=test",
    "a",
    "test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
/**
 * Shingle sizes 3 through 4 combined with the custom separator "=BLAH=" and
 * unigrams switched off: expects only the three- and four-word shingles
 * joined by the custom separator.
 */
public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
  final Map<String,String> config = new HashMap<String,String>();
  config.put("minShingleSize", "3");
  config.put("maxShingleSize", "4");
  config.put("tokenSeparator", "=BLAH=");
  config.put("outputUnigrams", "false");

  final ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(config);

  final String[] expected = {
    "this=BLAH=is=BLAH=a",
    "this=BLAH=is=BLAH=a=BLAH=test",
    "is=BLAH=a=BLAH=test"
  };
  final Reader text = new StringReader("this is a test");
  final TokenStream shingles =
      shingleFactory.create(new WhitespaceTokenizer(DEFAULT_VERSION, text));
  assertTokenStreamContents(shingles, expected);
}
} }