SOLR-2188: provide maxTokenLength arg for Classic, Standard, and UAX29URLEmail tokenizer factories

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1049693 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2010-12-15 20:24:26 +00:00
parent 60f0a9be8a
commit 33c9d97119
6 changed files with 90 additions and 3 deletions

solr/CHANGES.txt

@@ -314,6 +314,8 @@ New Features
   Adding a parameter NOW=<time_in_ms> to the request will override the
   current time. (Peter Sturge, yonik)
 
+* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer,
+  StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe)
 
 Optimizations
 ----------------------
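For context: in Solr, attributes on the analysis elements in schema.xml are handed to the factory as string key/value pairs, so the new argument is used as <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="1000"/>. A minimal programmatic sketch of the same flow, mirroring the style of the tests below (illustrative only, not part of this commit; the version label is a hypothetical placeholder; assumed imports: java.io.StringReader, java.util.HashMap, java.util.Map, org.apache.lucene.analysis.Tokenizer):

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "LUCENE_40"); // hypothetical version label
    args.put("maxTokenLength", "1000");          // the argument added by this commit
    StandardTokenizerFactory factory = new StandardTokenizerFactory();
    factory.init(args);
    Tokenizer stream = factory.create(new StringReader("text to tokenize"));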

solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java

@ -19,6 +19,8 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.util.Map;
@ -28,13 +30,20 @@ import java.util.Map;
*/
public class ClassicTokenizerFactory extends BaseTokenizerFactory {
private int maxTokenLength;
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
maxTokenLength = getInt("maxTokenLength",
StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}
public Tokenizer create(Reader input) {
return new ClassicTokenizer(luceneMatchVersion, input);
ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input);
tokenizer.setMaxTokenLength(maxTokenLength);
return tokenizer;
}
}
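All three factories read the argument through the getInt(name, default) helper inherited from the factory base class. A rough sketch of that helper's contract, shown only to make the default-value path concrete (an assumption about the base class, which this diff does not display):

    // Assumed contract of the inherited helper (not shown in this diff):
    // parse the named arg when present, otherwise return the supplied default.
    protected int getInt(String key, int defaultValue) {
      String value = args.get(key); // args retained by super.init(args)
      return value == null ? defaultValue : Integer.parseInt(value);
    }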

solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java

@@ -17,6 +17,7 @@
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
@@ -27,13 +28,21 @@ import java.util.Map;
  */
 public class StandardTokenizerFactory extends BaseTokenizerFactory {
+
+  private int maxTokenLength;
+
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
+    maxTokenLength = getInt("maxTokenLength",
+                            StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
   }
 
   public StandardTokenizer create(Reader input) {
-    return new StandardTokenizer(luceneMatchVersion, input);
+    StandardTokenizer tokenizer
+      = new StandardTokenizer(luceneMatchVersion, input);
+    tokenizer.setMaxTokenLength(maxTokenLength);
+    return tokenizer;
   }
 }
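StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH is 255, so a factory initialized without the new argument keeps the previous behavior. A sketch of the default path (illustrative only; the version label is a hypothetical placeholder):

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "LUCENE_40"); // hypothetical version label
    // No "maxTokenLength" entry: init(...) falls back to
    // StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH (255).
    StandardTokenizerFactory factory = new StandardTokenizerFactory();
    factory.init(args);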

solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java

@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 
 import java.io.Reader;
@@ -31,13 +32,20 @@ import java.util.Map;
  */
 public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
+
+  private int maxTokenLength;
+
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
+    maxTokenLength = getInt("maxTokenLength",
+                            StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
   }
 
   public UAX29URLEmailTokenizer create(Reader input) {
-    return new UAX29URLEmailTokenizer(input);
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input);
+    tokenizer.setMaxTokenLength(maxTokenLength);
+    return tokenizer;
   }
 }
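UAX29URLEmailTokenizer emits whole URLs and e-mail addresses as single tokens, so overlong tokens are more likely here than with the other two tokenizers; a larger-than-default cap is a plausible configuration (hedged sketch, values hypothetical):

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "LUCENE_40"); // hypothetical version label
    args.put("maxTokenLength", "2048");          // room for long URLs
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(args);
    Tokenizer stream = factory.create(
        new StringReader("docs at http://example.com/a/rather/long/path"));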

solr/src/test/org/apache/solr/analysis/TestStandardFactories.java

@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -40,6 +42,24 @@ public class TestStandardFactories extends BaseTokenTestCase {
         new String[] {"Wha\u0301t's", "this", "thing", "do" });
   }
 
+  public void testStandardTokenizerMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    StandardTokenizerFactory factory = new StandardTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
+
   /**
    * Test ClassicTokenizerFactory
    */
@@ -52,6 +72,24 @@ public class TestStandardFactories extends BaseTokenTestCase {
         new String[] {"What's", "this", "thing", "do" });
   }
 
+  public void testClassicTokenizerMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
+
   /**
    * Test ClassicFilterFactory
    */

solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java

@@ -19,6 +19,9 @@ package org.apache.solr.analysis;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
@@ -152,4 +155,22 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
       }
     );
   }
+
+  public void testMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream,
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
 }