mirror of https://github.com/apache/lucene.git
SOLR-1740: ShingleFilterFactory improvements
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@930163 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
88fc2e2f56
commit
47416b3948
|
@ -135,6 +135,10 @@ New Features
|
|||
TokenFilters now support custom Attributes, and some have improved performance:
|
||||
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
|
||||
|
||||
* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
|
||||
parameters for controlling the minimum shingle size produced by the filter, and
|
||||
the separator string that it uses, respectively. (Steven Rowe via rmuir)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -21,21 +21,49 @@ package org.apache.solr.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/** Factory for {@link ShingleFilter} */
|
||||
public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||
private int minShingleSize;
|
||||
private int maxShingleSize;
|
||||
private boolean outputUnigrams;
|
||||
private String tokenSeparator;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
maxShingleSize = getInt("maxShingleSize",
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
if (maxShingleSize < 2) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"Invalid maxShingleSize (" + maxShingleSize
|
||||
+ ") - must be at least 2");
|
||||
}
|
||||
minShingleSize = getInt("minShingleSize",
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||
if (minShingleSize < 2) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"Invalid minShingleSize (" + minShingleSize
|
||||
+ ") - must be at least 2");
|
||||
}
|
||||
if (minShingleSize > maxShingleSize) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"Invalid minShingleSize (" + minShingleSize
|
||||
+ ") - must be no greater than maxShingleSize ("
|
||||
+ maxShingleSize + ")");
|
||||
}
|
||||
outputUnigrams = getBoolean("outputUnigrams", true);
|
||||
tokenSeparator = args.containsKey("tokenSeparator")
|
||||
? args.get("tokenSeparator")
|
||||
: ShingleFilter.TOKEN_SEPARATOR;
|
||||
}
|
||||
public ShingleFilter create(TokenStream input) {
|
||||
ShingleFilter r = new ShingleFilter(input,maxShingleSize);
|
||||
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
|
||||
r.setOutputUnigrams(outputUnigrams);
|
||||
r.setTokenSeparator(tokenSeparator);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,4 +70,150 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
|
|||
new String[] {"this", "this is", "this is a", "is",
|
||||
"is a", "is a test", "a", "a test", "test"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with higher min (and max) shingle size
|
||||
*/
|
||||
public void testMinShingleSize() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "4");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this", "this is a", "this is a test",
|
||||
"is", "is a test", "a", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with higher min (and max) shingle size and with unigrams disabled
|
||||
*/
|
||||
public void testMinShingleSizeNoUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "4");
|
||||
args.put("outputUnigrams", "false");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this is a", "this is a test", "is a test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with min and max shingle size set to the same, higher value
|
||||
*/
|
||||
public void testEqualMinAndMaxShingleSize() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "3");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this", "this is a", "is", "is a test", "a", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with min and max shingle size set to the same, higher value and with unigrams disabled
|
||||
*/
|
||||
public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "3");
|
||||
args.put("outputUnigrams", "false");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this is a", "is a test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with a non-default token separator
|
||||
*/
|
||||
public void testTokenSeparator() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("tokenSeparator", "=BLAH=");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a",
|
||||
"a", "a=BLAH=test", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with a non-default token separator and with unigrams disabled
|
||||
*/
|
||||
public void testTokenSeparatorNoUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("tokenSeparator", "=BLAH=");
|
||||
args.put("outputUnigrams", "false");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with an empty token separator
|
||||
*/
|
||||
public void testEmptyTokenSeparator() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("tokenSeparator", "");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with higher min (and max) shingle size
|
||||
* and with a non-default token separator
|
||||
*/
|
||||
public void testMinShingleSizeAndTokenSeparator() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "4");
|
||||
args.put("tokenSeparator", "=BLAH=");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this", "this=BLAH=is=BLAH=a",
|
||||
"this=BLAH=is=BLAH=a=BLAH=test", "is",
|
||||
"is=BLAH=a=BLAH=test", "a", "test" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with higher min (and max) shingle size
|
||||
* and with a non-default token separator
|
||||
* and with unigrams disabled
|
||||
*/
|
||||
public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minShingleSize", "3");
|
||||
args.put("maxShingleSize", "4");
|
||||
args.put("tokenSeparator", "=BLAH=");
|
||||
args.put("outputUnigrams", "false");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
|
||||
"is=BLAH=a=BLAH=test", });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue