From f3b3a29360204e1efda4ae7ad857e4c8313105fb Mon Sep 17 00:00:00 2001 From: Ryan McKinley Date: Mon, 24 Aug 2009 18:58:22 +0000 Subject: [PATCH] SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer rather then a TokenStream (that may be or may not be a Tokenizer). This change is required to take advantage of the Token reuse improvements in lucene 2.9. git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@807338 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 10 +++ .../HTMLStripStandardTokenizerFactory.java | 11 ++- .../HTMLStripWhitespaceTokenizerFactory.java | 11 ++- .../analysis/PatternTokenizerFactory.java | 68 +++++++++---------- .../solr/analysis/TokenizerFactory.java | 2 +- .../solr/analysis/TrieTokenizerFactory.java | 2 +- .../apache/solr/BasicFunctionalityTest.java | 19 ++++++ src/test/test-files/solr/conf/schema.xml | 6 ++ 8 files changed, 88 insertions(+), 41 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 3c138abb1f4..e3bae4aa3c2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -42,6 +42,12 @@ requests. (The simplest way to do this is by specifying it as a default param for your request handlers in solrconfig.xml, see the example solrconfig.xml for sample syntax.) +The TokenizerFactory API has changed to explicitly return a Tokenizer rather then +a TokenStream (that may be or may not be a Tokenizer). This change is required +to take advantage of the Token reuse improvements in lucene 2.9. For more +information, see SOLR-1377. + + Versions of Major Components ---------------------------- Apache Lucene 2.9-dev r804692 @@ -615,6 +621,10 @@ Other Changes 45. SOLR1276: Added StatsComponentTest (Rafał Kuć, gsingers) +46. SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer + rather then a TokenStream (that may be or may not be a Tokenizer). This change + is required to take advantage of the Token reuse improvements in lucene 2.9. (ryan) + Build ---------------------- diff --git a/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java b/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java index 014fcb46398..154beb1c4d6 100644 --- a/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java @@ -18,9 +18,11 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.Reader; +import java.io.IOException; /** * @version $Id$ @@ -28,7 +30,12 @@ import java.io.Reader; */ @Deprecated public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory { - public TokenStream create(Reader input) { - return new StandardTokenizer(new HTMLStripReader(input)); + public Tokenizer create(Reader input) { + return new StandardTokenizer(new HTMLStripReader(input)) { + @Override + public void reset(Reader reader) throws IOException { + super.reset(new HTMLStripReader(reader)); + } + }; } } diff --git a/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java b/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java index a38b5ef7e34..a2ec9be86ff 100644 --- a/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java @@ -18,9 +18,11 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; import java.io.Reader; +import java.io.IOException; /** * @version $Id$ @@ -28,7 +30,12 @@ import java.io.Reader; */ @Deprecated public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory { - public TokenStream create(Reader input) { - return new WhitespaceTokenizer(new HTMLStripReader(input)); + public Tokenizer create(Reader input) { + return new WhitespaceTokenizer(new HTMLStripReader(input)) { + @Override + public void reset(Reader input) throws IOException { + super.reset(new HTMLStripReader(input)); + } + }; } } diff --git a/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java b/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java index f9faed31a9e..28c6b56d533 100644 --- a/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java @@ -17,16 +17,6 @@ package org.apache.solr.analysis; -import org.apache.commons.io.IOUtils; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.CharStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.solr.common.SolrException; -import org.apache.solr.core.SolrConfig; - import java.io.IOException; import java.io.Reader; import java.util.ArrayList; @@ -36,6 +26,11 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.solr.common.SolrException; + /** * This tokenizer uses regex pattern matching to construct distinct tokens @@ -103,41 +98,44 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory /** * Split the input using configured pattern */ - public TokenStream create(Reader input) { + public Tokenizer create(final Reader in) { try { - // Read the input into a single string - String str = IOUtils.toString( input ); - - Matcher matcher = pattern.matcher( str ); - List tokens = (group < 0 ) - ? split( matcher, str ) - : group( matcher, str, group ); - - final Iterator iter = tokens.iterator(); - return new TokenStream() { - @Override - public boolean incrementToken() throws IOException { - return super.incrementToken(); + return new Tokenizer(in) { + {init();} + + List tokens; + Iterator iter; + + void init() throws IOException { + // Read the input into a single string + String str = IOUtils.toString( input ); + + Matcher matcher = pattern.matcher( str ); + tokens = (group < 0 ) + ? split( matcher, str ) + : group( matcher, str, group ); + iter = tokens.iterator(); } +// @Override +// public boolean incrementToken() throws IOException { +// return super.incrementToken(); +// } + @Override public void end() throws IOException { super.end(); } - @Override - public Token next(Token reusableToken) throws IOException { - return super.next(reusableToken); - } +// @Override +// public Token next(Token reusableToken) throws IOException { +// return super.next(reusableToken); +// } @Override - public void reset() throws IOException { - super.reset(); - } - - @Override - public void close() throws IOException { - super.close(); + public void reset(Reader input) throws IOException { + super.reset(input); + init(); } @Override diff --git a/src/java/org/apache/solr/analysis/TokenizerFactory.java b/src/java/org/apache/solr/analysis/TokenizerFactory.java index 48cc9c7ac90..f9ef4152f7e 100644 --- a/src/java/org/apache/solr/analysis/TokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/TokenizerFactory.java @@ -65,6 +65,6 @@ public interface TokenizerFactory { public Map getArgs(); /** Creates a TokenStream of the specified input */ - public TokenStream create(Reader input); + public Tokenizer create(Reader input); } diff --git a/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java b/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java index 99dd7992fff..e975a4876a6 100644 --- a/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java @@ -47,7 +47,7 @@ public class TrieTokenizerFactory extends BaseTokenizerFactory { this.precisionStep = precisionStep; } - public TokenStream create(Reader input) { + public Tokenizer create(Reader input) { return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep)); } } diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java index ce45cd23b52..c1a0317b081 100644 --- a/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -466,6 +466,25 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { } + + public void testTokenizer() { + + assertU(adoc("id", "4055", + "patterntok", "Hello,There")); + assertU(adoc("id", "4056", + "patterntok", "Goodbye,Now")); + assertU(commit()); + + assertQ("make sure it split ok", + req("patterntok:Hello") + ,"*[count(//doc)=1]" + ); + assertQ("make sure it split ok", + req("patterntok:Goodbye") + ,"*[count(//doc)=1]" + ); + } + public void testConfigDefaults() { assertU(adoc("id", "42", "name", "Zapp Brannigan")); diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index 6db63261ea1..1d438c4b684 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -205,6 +205,11 @@ + + + + + @@ -422,6 +427,7 @@ +