mirror of https://github.com/apache/lucene.git
SOLR-1740: ShingleFilterFactory improvements
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@930163 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
88fc2e2f56
commit
47416b3948
|
@ -135,6 +135,10 @@ New Features
|
||||||
TokenFilters now support custom Attributes, and some have improved performance:
|
TokenFilters now support custom Attributes, and some have improved performance:
|
||||||
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
|
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
|
||||||
|
|
||||||
|
* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator"
|
||||||
|
parameters for controlling the minimum shingle size produced by the filter, and
|
||||||
|
the separator string that it uses, respectively. (Steven Rowe via rmuir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -21,21 +21,49 @@ package org.apache.solr.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/** Factory for {@link ShingleFilter} */
|
/** Factory for {@link ShingleFilter} */
|
||||||
public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private int minShingleSize;
|
||||||
private int maxShingleSize;
|
private int maxShingleSize;
|
||||||
private boolean outputUnigrams;
|
private boolean outputUnigrams;
|
||||||
|
private String tokenSeparator;
|
||||||
|
|
||||||
public void init(Map<String, String> args) {
|
public void init(Map<String, String> args) {
|
||||||
super.init(args);
|
super.init(args);
|
||||||
maxShingleSize = getInt("maxShingleSize",
|
maxShingleSize = getInt("maxShingleSize",
|
||||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||||
|
if (maxShingleSize < 2) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
|
"Invalid maxShingleSize (" + maxShingleSize
|
||||||
|
+ ") - must be at least 2");
|
||||||
|
}
|
||||||
|
minShingleSize = getInt("minShingleSize",
|
||||||
|
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||||
|
if (minShingleSize < 2) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
|
"Invalid minShingleSize (" + minShingleSize
|
||||||
|
+ ") - must be at least 2");
|
||||||
|
}
|
||||||
|
if (minShingleSize > maxShingleSize) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
|
"Invalid minShingleSize (" + minShingleSize
|
||||||
|
+ ") - must be no greater than maxShingleSize ("
|
||||||
|
+ maxShingleSize + ")");
|
||||||
|
}
|
||||||
outputUnigrams = getBoolean("outputUnigrams", true);
|
outputUnigrams = getBoolean("outputUnigrams", true);
|
||||||
|
tokenSeparator = args.containsKey("tokenSeparator")
|
||||||
|
? args.get("tokenSeparator")
|
||||||
|
: ShingleFilter.TOKEN_SEPARATOR;
|
||||||
}
|
}
|
||||||
public ShingleFilter create(TokenStream input) {
|
public ShingleFilter create(TokenStream input) {
|
||||||
ShingleFilter r = new ShingleFilter(input,maxShingleSize);
|
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
|
||||||
r.setOutputUnigrams(outputUnigrams);
|
r.setOutputUnigrams(outputUnigrams);
|
||||||
|
r.setTokenSeparator(tokenSeparator);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,4 +70,150 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
|
||||||
new String[] {"this", "this is", "this is a", "is",
|
new String[] {"this", "this is", "this is a", "is",
|
||||||
"is a", "is a test", "a", "a test", "test"});
|
"is a", "is a test", "a", "a test", "test"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher min (and max) shingle size
|
||||||
|
*/
|
||||||
|
public void testMinShingleSize() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "4");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this", "this is a", "this is a test",
|
||||||
|
"is", "is a test", "a", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher min (and max) shingle size and with unigrams disabled
|
||||||
|
*/
|
||||||
|
public void testMinShingleSizeNoUnigrams() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "4");
|
||||||
|
args.put("outputUnigrams", "false");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this is a", "this is a test", "is a test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher same min and max shingle size
|
||||||
|
*/
|
||||||
|
public void testEqualMinAndMaxShingleSize() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "3");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this", "this is a", "is", "is a test", "a", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher same min and max shingle size and with unigrams disabled
|
||||||
|
*/
|
||||||
|
public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "3");
|
||||||
|
args.put("outputUnigrams", "false");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this is a", "is a test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with a non-default token separator
|
||||||
|
*/
|
||||||
|
public void testTokenSeparator() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("tokenSeparator", "=BLAH=");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a",
|
||||||
|
"a", "a=BLAH=test", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with a non-default token separator and with unigrams disabled
|
||||||
|
*/
|
||||||
|
public void testTokenSeparatorNoUnigrams() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("tokenSeparator", "=BLAH=");
|
||||||
|
args.put("outputUnigrams", "false");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with an empty token separator
|
||||||
|
*/
|
||||||
|
public void testEmptyTokenSeparator() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("tokenSeparator", "");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher min (and max) shingle size
|
||||||
|
* and with a non-default token separator
|
||||||
|
*/
|
||||||
|
public void testMinShingleSizeAndTokenSeparator() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "4");
|
||||||
|
args.put("tokenSeparator", "=BLAH=");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this", "this=BLAH=is=BLAH=a",
|
||||||
|
"this=BLAH=is=BLAH=a=BLAH=test", "is",
|
||||||
|
"is=BLAH=a=BLAH=test", "a", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with higher min (and max) shingle size
|
||||||
|
* and with a non-default token separator
|
||||||
|
* and with unigrams disabled
|
||||||
|
*/
|
||||||
|
public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception {
|
||||||
|
Reader reader = new StringReader("this is a test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("minShingleSize", "3");
|
||||||
|
args.put("maxShingleSize", "4");
|
||||||
|
args.put("tokenSeparator", "=BLAH=");
|
||||||
|
args.put("outputUnigrams", "false");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
|
||||||
|
"is=BLAH=a=BLAH=test", });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue