SOLR-1740: ShingleFilterFactory improvements

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@930163 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-04-02 05:00:53 +00:00
parent 88fc2e2f56
commit 47416b3948
3 changed files with 179 additions and 1 deletions

View File

@ -135,6 +135,10 @@ New Features
TokenFilters now support custom Attributes, and some have improved performance:
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
parameters for controlling the minimum shingle size produced by the filter, and
the separator string that it uses, respectively. (Steven Rowe via rmuir)
Optimizations
----------------------

View File

@ -21,21 +21,49 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import java.util.Map;
/** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int minShingleSize;
private int maxShingleSize;
private boolean outputUnigrams;
private String tokenSeparator;
public void init(Map<String, String> args) {
super.init(args);
maxShingleSize = getInt("maxShingleSize",
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
if (maxShingleSize < 2) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid maxShingleSize (" + maxShingleSize
+ ") - must be at least 2");
}
minShingleSize = getInt("minShingleSize",
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
if (minShingleSize < 2) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid minShingleSize (" + minShingleSize
+ ") - must be at least 2");
}
if (minShingleSize > maxShingleSize) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid minShingleSize (" + minShingleSize
+ ") - must be no greater than maxShingleSize ("
+ maxShingleSize + ")");
}
outputUnigrams = getBoolean("outputUnigrams", true);
tokenSeparator = args.containsKey("tokenSeparator")
? args.get("tokenSeparator")
: ShingleFilter.TOKEN_SEPARATOR;
}
public ShingleFilter create(TokenStream input) {
ShingleFilter r = new ShingleFilter(input,maxShingleSize);
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
r.setOutputUnigrams(outputUnigrams);
r.setTokenSeparator(tokenSeparator);
return r;
}
}

View File

@ -70,4 +70,150 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
new String[] {"this", "this is", "this is a", "is",
"is a", "is a test", "a", "a test", "test"});
}
/**
* Test with higher min (and max) shingle size
*/
public void testMinShingleSize() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "4");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this", "this is a", "this is a test",
"is", "is a test", "a", "test" });
}
/**
* Test with higher min (and max) shingle size and with unigrams disabled
*/
public void testMinShingleSizeNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "4");
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this is a", "this is a test", "is a test" });
}
/**
* Test with higher same min and max shingle size
*/
public void testEqualMinAndMaxShingleSize() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "3");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this", "this is a", "is", "is a test", "a", "test" });
}
/**
* Test with higher same min and max shingle size and with unigrams disabled
*/
public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "3");
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this is a", "is a test" });
}
/**
* Test with a non-default token separator
*/
public void testTokenSeparator() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("tokenSeparator", "=BLAH=");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a",
"a", "a=BLAH=test", "test" });
}
/**
* Test with a non-default token separator and with unigrams disabled
*/
public void testTokenSeparatorNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("tokenSeparator", "=BLAH=");
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" });
}
/**
* Test with an empty token separator
*/
public void testEmptyTokenSeparator() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("tokenSeparator", "");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" });
}
/**
* Test with higher min (and max) shingle size
* and with a non-default token separator
*/
public void testMinShingleSizeAndTokenSeparator() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "4");
args.put("tokenSeparator", "=BLAH=");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this", "this=BLAH=is=BLAH=a",
"this=BLAH=is=BLAH=a=BLAH=test", "is",
"is=BLAH=a=BLAH=test", "a", "test" });
}
/**
* Test with higher min (and max) shingle size
* and with a non-default token separator
* and with unigrams disabled
*/
public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("minShingleSize", "3");
args.put("maxShingleSize", "4");
args.put("tokenSeparator", "=BLAH=");
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
assertTokenStreamContents(stream,
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
"is=BLAH=a=BLAH=test", });
}
}