From 47416b3948786e3dea7acea28e799ce26f14fc2f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 2 Apr 2010 05:00:53 +0000 Subject: [PATCH] SOLR-1740: ShingleFilterFactory improvements git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@930163 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 + .../solr/analysis/ShingleFilterFactory.java | 30 +++- .../analysis/TestShingleFilterFactory.java | 146 ++++++++++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index f30e87e2cd0..4eb4f6d72db 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -135,6 +135,10 @@ New Features TokenFilters now support custom Attributes, and some have improved performance: especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler) +* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator" + parameters for controlling the minimum shingle size produced by the filter, and + the separator string that it uses, respectively. (Steven Rowe via rmuir) + Optimizations ---------------------- diff --git a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java index 9ebff5a8d12..bf8bdb23f1d 100644 --- a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java @@ -21,21 +21,49 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + import java.util.Map; /** Factory for {@link ShingleFilter} */ public class ShingleFilterFactory extends BaseTokenFilterFactory { + private int minShingleSize; private int maxShingleSize; private boolean outputUnigrams; + private String tokenSeparator; + public void init(Map args) { super.init(args); maxShingleSize = getInt("maxShingleSize", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); + if (maxShingleSize < 2) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid maxShingleSize (" + maxShingleSize + + ") - must be at least 2"); + } + minShingleSize = getInt("minShingleSize", + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE); + if (minShingleSize < 2) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid minShingleSize (" + minShingleSize + + ") - must be at least 2"); + } + if (minShingleSize > maxShingleSize) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid minShingleSize (" + minShingleSize + + ") - must be no greater than maxShingleSize (" + + maxShingleSize + ")"); + } outputUnigrams = getBoolean("outputUnigrams", true); + tokenSeparator = args.containsKey("tokenSeparator") + ? args.get("tokenSeparator") + : ShingleFilter.TOKEN_SEPARATOR; } public ShingleFilter create(TokenStream input) { - ShingleFilter r = new ShingleFilter(input,maxShingleSize); + ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); r.setOutputUnigrams(outputUnigrams); + r.setTokenSeparator(tokenSeparator); return r; } } diff --git a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java index ede684971ac..4e8248cf376 100644 --- a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java @@ -70,4 +70,150 @@ public class TestShingleFilterFactory extends BaseTokenTestCase { new String[] {"this", "this is", "this is a", "is", "is a", "is a test", "a", "a test", "test"}); } + + /** + * Test with higher min (and max) shingle size + */ + public void testMinShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "this is a test", + "is", "is a test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size and with unigrams disabled + */ + public void testMinShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this is a", "this is a test", "is a test" }); + } + + /** + * Test with higher same min and max shingle size + */ + public void testEqualMinAndMaxShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "is", "is a test", "a", "test" }); + } + + /** + * Test with higher same min and max shingle size and with unigrams disabled + */ + public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this is a", "is a test" }); + } + + /** + * Test with a non-default token separator + */ + public void testTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a", + "a", "a=BLAH=test", "test" }); + } + + /** + * Test with a non-default token separator and with unigrams disabled + */ + public void testTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" }); + } + + /** + * Test with an empty token separator + */ + public void testEmptyTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", ""); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + */ + public void testMinShingleSizeAndTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is=BLAH=a", + "this=BLAH=is=BLAH=a=BLAH=test", "is", + "is=BLAH=a=BLAH=test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + * and with unigrams disabled + */ + public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", + "is=BLAH=a=BLAH=test", }); + } }