From 33c9d97119e4035657966b86d4b74bd03534e9aa Mon Sep 17 00:00:00 2001 From: Steven Rowe Date: Wed, 15 Dec 2010 20:24:26 +0000 Subject: [PATCH] SOLR-2188: provide maxTokenLength arg for Classic, Standard, and UAX29URLEmail tokenizer factories git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1049693 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + .../analysis/ClassicTokenizerFactory.java | 11 +++++- .../analysis/StandardTokenizerFactory.java | 11 +++++- .../UAX29URLEmailTokenizerFactory.java | 10 ++++- .../solr/analysis/TestStandardFactories.java | 38 +++++++++++++++++++ .../TestUAX29URLEmailTokenizerFactory.java | 21 ++++++++++ 6 files changed, 90 insertions(+), 3 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8348ab8a163..fde689e9b99 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -314,6 +314,8 @@ New Features Adding a parameter NOW= to the request will override the current time. (Peter Sturge, yonik) +* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer, + StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe) Optimizations ---------------------- diff --git a/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java index 5fa929b8a88..7bf4b3050fb 100644 --- a/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java @@ -19,6 +19,8 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.Reader; import java.util.Map; @@ -28,13 +30,20 @@ import java.util.Map; */ public class ClassicTokenizerFactory extends BaseTokenizerFactory { + + private int maxTokenLength; + @Override public void init(Map args) { super.init(args); assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); } public Tokenizer create(Reader input) { - return new ClassicTokenizer(luceneMatchVersion, input); + ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; } } diff --git a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java index e7e27eed9dd..f1d09cac5d0 100644 --- a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java @@ -17,6 +17,7 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.Reader; @@ -27,13 +28,21 @@ import java.util.Map; */ public class StandardTokenizerFactory extends BaseTokenizerFactory { + + private int maxTokenLength; + @Override public void init(Map args) { super.init(args); assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); } public StandardTokenizer create(Reader input) { - return new StandardTokenizer(luceneMatchVersion, input); + StandardTokenizer tokenizer + = new StandardTokenizer(luceneMatchVersion, input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; } } diff --git a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java index 5b99c8d2a56..bdffd9442ea 100644 --- a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java @@ -20,6 +20,7 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; import java.io.Reader; @@ -31,13 +32,20 @@ import java.util.Map; */ public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory { + + private int maxTokenLength; + @Override public void init(Map args) { super.init(args); assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); } public UAX29URLEmailTokenizer create(Reader input) { - return new UAX29URLEmailTokenizer(input); + UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; } } diff --git a/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java b/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java index ef2bd9972c4..267a3643b41 100644 --- a/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java +++ b/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java @@ -19,6 +19,8 @@ package org.apache.solr.analysis; import java.io.Reader; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -40,6 +42,24 @@ public class TestStandardFactories extends BaseTokenTestCase { new String[] {"Wha\u0301t's", "this", "thing", "do" }); } + public void testStandardTokenizerMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion")); + args.put("maxTokenLength", "1000"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } + /** * Test ClassicTokenizerFactory */ @@ -52,6 +72,24 @@ public class TestStandardFactories extends BaseTokenTestCase { new String[] {"What's", "this", "thing", "do" }); } + public void testClassicTokenizerMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion")); + args.put("maxTokenLength", "1000"); + ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } + /** * Test ClassicFilterFactory */ diff --git a/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java b/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java index f3982d400ca..558778d2cde 100644 --- a/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java @@ -19,6 +19,9 @@ package org.apache.solr.analysis; import java.io.Reader; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + import org.apache.lucene.analysis.Tokenizer; /** @@ -152,4 +155,22 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase { } ); } + + public void testMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion")); + args.put("maxTokenLength", "1000"); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } }