SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer

rather than a TokenStream (which may or may not be a Tokenizer).  This change
is required to take advantage of the Token reuse improvements in Lucene 2.9.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@807338 13f79535-47bb-0310-9956-ffa450edef68
Ryan McKinley 2009-08-24 18:58:22 +00:00
parent 5d1bb05f32
commit f3b3a29360
8 changed files with 88 additions and 41 deletions
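For context on the Token reuse this commit refers to: because create(Reader) now returns a
Tokenizer, a consumer can build the tokenizer once and rewind it onto each new Reader with
reset(Reader), instead of allocating a fresh TokenStream per document. A minimal sketch of
that pattern against the Lucene 2.9 API (WhitespaceTokenizer and the driver class are
stand-ins for illustration, not part of this commit):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ReuseSketch {
  public static void main(String[] args) throws IOException {
    // Build the Tokenizer once; the concrete class doesn't matter here.
    Tokenizer tok = new WhitespaceTokenizer(new StringReader(""));
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);

    for (String doc : new String[] {"first doc", "second doc"}) {
      // Rewinding is only possible because the factory hands back a Tokenizer.
      tok.reset(new StringReader(doc));
      while (tok.incrementToken()) {
        System.out.println(term.term());
      }
    }
    tok.close();
  }
}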

CHANGES.txt

@@ -42,6 +42,12 @@ requests. (The simplest way to do this is by specifying it as a default param
 for your request handlers in solrconfig.xml, see the example solrconfig.xml for
 sample syntax.)

+The TokenizerFactory API has changed to explicitly return a Tokenizer rather than
+a TokenStream (which may or may not be a Tokenizer).  This change is required to
+take advantage of the Token reuse improvements in Lucene 2.9.  For more
+information, see SOLR-1377.
+
 Versions of Major Components
 ----------------------------
 Apache Lucene 2.9-dev r804692

@@ -615,6 +621,10 @@ Other Changes
 45. SOLR-1276: Added StatsComponentTest (Rafał Kuć, gsingers)

+46. SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
+    rather than a TokenStream (which may or may not be a Tokenizer).  This change
+    is required to take advantage of the Token reuse improvements in Lucene 2.9. (ryan)
+
 Build
 ----------------------

HTMLStripStandardTokenizerFactory.java

@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;

 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

 import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$

@@ -28,7 +30,12 @@ import java.io.Reader;
  */
@Deprecated
 public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new StandardTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new StandardTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader reader) throws IOException {
+        super.reset(new HTMLStripReader(reader));
+      }
+    };
   }
 }
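The anonymous subclass above exists because Tokenizer.reset(Reader) simply swaps in the new
Reader: a reused tokenizer would otherwise start reading raw markup instead of stripped text,
so the override wraps every new Reader in HTMLStripReader again. A small standalone sketch of
what that wrapping does (the driver class is hypothetical; it only assumes HTMLStripReader's
plain Reader behavior as used in the factory above):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.solr.analysis.HTMLStripReader;

public class HtmlStripReaderSketch {
  public static void main(String[] args) throws IOException {
    // HTMLStripReader filters the character stream, dropping the markup, so any
    // tokenizer reading from it only ever sees the text content.
    Reader stripped = new HTMLStripReader(new StringReader("<b>Hello</b> <i>World</i>"));
    StringBuilder out = new StringBuilder();
    int c;
    while ((c = stripped.read()) != -1) {
      out.append((char) c);
    }
    System.out.println(out);   // the input with the HTML tags stripped
  }
}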

HTMLStripWhitespaceTokenizerFactory.java

@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;

 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

 import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$

@@ -28,7 +30,12 @@ import java.io.Reader;
  */
@Deprecated
 public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new WhitespaceTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new WhitespaceTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader input) throws IOException {
+        super.reset(new HTMLStripReader(input));
+      }
+    };
   }
 }

PatternTokenizerFactory.java

@@ -17,16 +17,6 @@
 package org.apache.solr.analysis;

-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.core.SolrConfig;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;

@@ -36,6 +26,11 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.solr.common.SolrException;
+
 /**
  * This tokenizer uses regex pattern matching to construct distinct tokens

@@ -103,41 +98,44 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
   /**
    * Split the input using configured pattern
    */
-  public TokenStream create(Reader input) {
+  public Tokenizer create(final Reader in) {
     try {
-      // Read the input into a single string
-      String str = IOUtils.toString( input );
-      Matcher matcher = pattern.matcher( str );
-      List<Token> tokens = (group < 0 )
-        ? split( matcher, str )
-        : group( matcher, str, group );
-      final Iterator<Token> iter = tokens.iterator();
-      return new TokenStream() {
-        @Override
-        public boolean incrementToken() throws IOException {
-          return super.incrementToken();
+      return new Tokenizer(in) {
+        {init();}
+        List<Token> tokens;
+        Iterator<Token> iter;
+        void init() throws IOException {
+          // Read the input into a single string
+          String str = IOUtils.toString( input );
+          Matcher matcher = pattern.matcher( str );
+          tokens = (group < 0 )
+            ? split( matcher, str )
+            : group( matcher, str, group );
+          iter = tokens.iterator();
         }
+        // @Override
+        // public boolean incrementToken() throws IOException {
+        //   return super.incrementToken();
+        // }
         @Override
         public void end() throws IOException {
           super.end();
         }
-        @Override
-        public Token next(Token reusableToken) throws IOException {
-          return super.next(reusableToken);
-        }
+        // @Override
+        // public Token next(Token reusableToken) throws IOException {
+        //   return super.next(reusableToken);
+        // }
         @Override
-        public void reset() throws IOException {
-          super.reset();
-        }
-        @Override
-        public void close() throws IOException {
-          super.close();
+        public void reset(Reader input) throws IOException {
+          super.reset(input);
+          init();
         }
         @Override
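The split(matcher, str) and group(matcher, str, group) helpers referenced above give the
factory its two modes: a negative group treats the pattern as a delimiter, while a
non-negative group emits that capture group of every match as a token. A standalone sketch
of the two behaviors with plain java.util.regex (not the Solr classes), using the same kind
of comma-separated input the new test below indexes:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PatternModesSketch {
  public static void main(String[] args) {
    String text = "Hello,There";

    // "Split" mode (group < 0): the pattern matches the delimiters between tokens.
    for (String tok : Pattern.compile(",").split(text)) {
      System.out.println(tok);          // Hello, then There
    }

    // "Group" mode (group >= 0): the chosen capture group of each match is a token.
    Matcher m = Pattern.compile("([A-Za-z]+)").matcher(text);
    while (m.find()) {
      System.out.println(m.group(1));   // Hello, then There
    }
  }
}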

TokenizerFactory.java

@@ -65,6 +65,6 @@ public interface TokenizerFactory {
   public Map<String,String> getArgs();

   /** Creates a TokenStream of the specified input */
-  public TokenStream create(Reader input);
+  public Tokenizer create(Reader input);
 }
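For custom factories outside Solr, the migration implied by this interface change is
mechanical: the return type of create(Reader) narrows from TokenStream to Tokenizer. A
hypothetical factory under the new contract (the class name is invented for illustration;
it is not part of this commit):

package org.apache.solr.analysis;

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class ExampleWhitespaceTokenizerFactory extends BaseTokenizerFactory {
  // Any Tokenizer subclass satisfies the narrowed contract; returning a plain
  // TokenStream no longer compiles.
  public Tokenizer create(Reader input) {
    return new WhitespaceTokenizer(input);
  }
}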

TrieTokenizerFactory.java

@@ -47,7 +47,7 @@ public class TrieTokenizerFactory extends BaseTokenizerFactory {
     this.precisionStep = precisionStep;
   }

-  public TokenStream create(Reader input) {
+  public Tokenizer create(Reader input) {
     return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
   }
 }

BasicFunctionalityTest.java

@@ -466,6 +466,25 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
   }

+  public void testTokenizer() {
+    assertU(adoc("id", "4055",
+                 "patterntok", "Hello,There"));
+    assertU(adoc("id", "4056",
+                 "patterntok", "Goodbye,Now"));
+    assertU(commit());
+
+    assertQ("make sure it split ok",
+            req("patterntok:Hello")
+            ,"*[count(//doc)=1]"
+    );
+    assertQ("make sure it split ok",
+            req("patterntok:Goodbye")
+            ,"*[count(//doc)=1]"
+    );
+  }
+
   public void testConfigDefaults() {
     assertU(adoc("id", "42",
                  "name", "Zapp Brannigan"));

View File

@@ -205,6 +205,11 @@
       <tokenizer class="solr.KeywordTokenizerFactory"/>
     </analyzer>
   </fieldtype>
+  <fieldtype name="patterntok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
+    </analyzer>
+  </fieldtype>
   <fieldtype name="porterfilt" class="solr.TextField">
     <analyzer>
       <tokenizer class="solr.WhitespaceTokenizerFactory"/>

@@ -422,6 +427,7 @@
   <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
   <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
   <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+  <field name="patterntok" type="patterntok" indexed="true" stored="true"/>
   <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
   <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
   <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>