mirror of https://github.com/apache/lucene.git
SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
rather than a TokenStream (that may or may not be a Tokenizer). This change is required to take advantage of the Token reuse improvements in Lucene 2.9.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@807338 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5d1bb05f32
commit
f3b3a29360
CHANGES.txt | 10 ++++++++++
CHANGES.txt
@@ -42,6 +42,12 @@ requests. (The simplest way to do this is by specifying it as a default param
 for your request handlers in solrconfig.xml, see the example solrconfig.xml for
 sample syntax.)
 
+The TokenizerFactory API has changed to explicitly return a Tokenizer rather than
+a TokenStream (that may or may not be a Tokenizer). This change is required
+to take advantage of the Token reuse improvements in lucene 2.9. For more
+information, see SOLR-1377.
+
+
 Versions of Major Components
 ----------------------------
 Apache Lucene 2.9-dev r804692
@@ -615,6 +621,10 @@ Other Changes
 
 45. SOLR-1276: Added StatsComponentTest (Rafał Kuć, gsingers)
 
+46. SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
+    rather than a TokenStream (that may or may not be a Tokenizer). This change
+    is required to take advantage of the Token reuse improvements in lucene 2.9. (ryan)
+
 
 Build
 ----------------------
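For TokenizerFactory implementations maintained outside of Solr, the migration is mechanical: every Tokenizer is already a TokenStream, so typically only the declared return type changes. A minimal sketch, assuming a hypothetical third-party factory (MyTokenizerFactory and its anonymous CharTokenizer subclass are illustrative only, not part of this commit):

    package org.apache.solr.analysis;

    import java.io.Reader;

    import org.apache.lucene.analysis.CharTokenizer;
    import org.apache.lucene.analysis.Tokenizer;

    // Hypothetical plugin factory, shown only to illustrate the SOLR-1377
    // signature change; before this commit the method returned TokenStream.
    public class MyTokenizerFactory extends BaseTokenizerFactory {
      public Tokenizer create(Reader input) {
        return new CharTokenizer(input) {
          // Split on anything that is not a letter or digit.
          @Override
          protected boolean isTokenChar(char c) {
            return Character.isLetterOrDigit(c);
          }
        };
      }
    }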
HTMLStripStandardTokenizerFactory.java
@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -28,7 +30,12 @@ import java.io.Reader;
  */
 @Deprecated
 public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new StandardTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new StandardTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader reader) throws IOException {
+        super.reset(new HTMLStripReader(reader));
+      }
+    };
   }
 }
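A note on the reset(Reader) override added above (and mirrored in HTMLStripWhitespaceTokenizerFactory below): the Token-reuse path in Lucene 2.9 keeps one Tokenizer per thread and repoints it at each new document via reset(Reader). Without the override, the replacement Reader would bypass HTMLStripReader and raw markup would reach the tokenizer. A minimal usage sketch, assuming the factory needs no init arguments (the driver class and sample inputs are illustrative):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.solr.analysis.HTMLStripStandardTokenizerFactory;

    public class ResetReuseSketch {
      public static void main(String[] args) throws IOException {
        HTMLStripStandardTokenizerFactory factory = new HTMLStripStandardTokenizerFactory();
        Tokenizer tok = factory.create(new StringReader("<b>first</b> doc"));
        // ... consume the tokens of the first document ...

        // Reuse: instead of creating a new Tokenizer for the next document,
        // the analyzer calls reset(Reader). The override re-wraps the new
        // Reader in HTMLStripReader, so markup is stripped here as well.
        tok.reset(new StringReader("<i>second</i> doc"));
      }
    }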
HTMLStripWhitespaceTokenizerFactory.java
@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 
 import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -28,7 +30,12 @@ import java.io.Reader;
  */
 @Deprecated
 public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new WhitespaceTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new WhitespaceTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader input) throws IOException {
+        super.reset(new HTMLStripReader(input));
+      }
+    };
   }
 }
PatternTokenizerFactory.java
@@ -17,16 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.core.SolrConfig;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
@@ -36,6 +26,11 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.solr.common.SolrException;
+
 
 /**
  * This tokenizer uses regex pattern matching to construct distinct tokens
@@ -103,41 +98,44 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
   /**
    * Split the input using configured pattern
    */
-  public TokenStream create(Reader input) {
+  public Tokenizer create(final Reader in) {
     try {
-      // Read the input into a single string
-      String str = IOUtils.toString( input );
-
-      Matcher matcher = pattern.matcher( str );
-      List<Token> tokens = (group < 0 )
-        ? split( matcher, str )
-        : group( matcher, str, group );
-
-      final Iterator<Token> iter = tokens.iterator();
-      return new TokenStream() {
-        @Override
-        public boolean incrementToken() throws IOException {
-          return super.incrementToken();
-        }
+      return new Tokenizer(in) {
+        {init();}
+
+        List<Token> tokens;
+        Iterator<Token> iter;
+
+        void init() throws IOException {
+          // Read the input into a single string
+          String str = IOUtils.toString( input );
+
+          Matcher matcher = pattern.matcher( str );
+          tokens = (group < 0 )
+            ? split( matcher, str )
+            : group( matcher, str, group );
+          iter = tokens.iterator();
+        }
+
+        // @Override
+        // public boolean incrementToken() throws IOException {
+        //   return super.incrementToken();
+        // }
 
         @Override
         public void end() throws IOException {
           super.end();
         }
 
-        @Override
-        public Token next(Token reusableToken) throws IOException {
-          return super.next(reusableToken);
-        }
+        // @Override
+        // public Token next(Token reusableToken) throws IOException {
+        //   return super.next(reusableToken);
+        // }
 
         @Override
-        public void reset() throws IOException {
-          super.reset();
-        }
-
-        @Override
-        public void close() throws IOException {
-          super.close();
+        public void reset(Reader input) throws IOException {
+          super.reset(input);
+          init();
         }
 
         @Override
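What the PatternTokenizerFactory rewrite above does: the body of create() moves into an init() method on an anonymous Tokenizer subclass. The instance initializer {init();} runs it at construction, and the new reset(Reader) runs it again after super.reset(input) has repointed Tokenizer's protected input field, so a reused instance re-reads and re-splits the new Reader; the iteration over iter (past the end of this hunk) is unchanged. For reference, a standalone sketch of the factory's two modes, delimiter split (group < 0) versus match/group extraction, using plain java.util.regex; the factory's own split() and group() helpers are not reproduced here:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class PatternModesSketch {
      public static void main(String[] args) {
        String str = "Hello,There";

        // group < 0: the pattern acts as a delimiter, like String.split()
        for (String tok : Pattern.compile(",").split(str)) {
          System.out.println(tok);              // Hello, then There
        }

        // group >= 0: each match (here, group 0) becomes a token
        Matcher matcher = Pattern.compile("[A-Za-z]+").matcher(str);
        while (matcher.find()) {
          System.out.println(matcher.group(0)); // Hello, then There
        }
      }
    }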
TokenizerFactory.java
@@ -65,6 +65,6 @@ public interface TokenizerFactory {
   public Map<String,String> getArgs();
 
   /** Creates a TokenStream of the specified input */
-  public TokenStream create(Reader input);
+  public Tokenizer create(Reader input);
 }
TrieTokenizerFactory.java
@@ -47,7 +47,7 @@ public class TrieTokenizerFactory extends BaseTokenizerFactory {
     this.precisionStep = precisionStep;
   }
 
-  public TokenStream create(Reader input) {
+  public Tokenizer create(Reader input) {
     return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
   }
 }
BasicFunctionalityTest.java
@@ -466,6 +466,25 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
   }
 
 
+  public void testTokenizer() {
+
+    assertU(adoc("id", "4055",
+                 "patterntok", "Hello,There"));
+    assertU(adoc("id", "4056",
+                 "patterntok", "Goodbye,Now"));
+    assertU(commit());
+
+    assertQ("make sure it split ok",
+            req("patterntok:Hello")
+            ,"*[count(//doc)=1]"
+    );
+    assertQ("make sure it split ok",
+            req("patterntok:Goodbye")
+            ,"*[count(//doc)=1]"
+    );
+  }
+
+
   public void testConfigDefaults() {
     assertU(adoc("id", "42",
                  "name", "Zapp Brannigan"));
schema.xml (test config)
@@ -205,6 +205,11 @@
       <tokenizer class="solr.KeywordTokenizerFactory"/>
     </analyzer>
   </fieldtype>
+  <fieldtype name="patterntok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
+    </analyzer>
+  </fieldtype>
   <fieldtype name="porterfilt" class="solr.TextField">
     <analyzer>
       <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -422,6 +427,7 @@
   <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
   <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
   <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+  <field name="patterntok" type="patterntok" indexed="true" stored="true"/>
   <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
   <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
   <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
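The schema and test changes fit together: the new patterntok field type configures PatternTokenizerFactory with pattern=",", so "Hello,There" indexes as the two tokens Hello and There, and each testTokenizer query above matches exactly one document. A sketch of driving the factory directly with the same argument; it assumes the Solr 1.4-era init(Map) contract and the deprecated Lucene 2.9 next() consumption loop, so treat the driver as illustrative rather than canonical:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.solr.analysis.PatternTokenizerFactory;

    public class PatternTokSketch {
      public static void main(String[] args) throws Exception {
        // Mirrors <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
        Map<String,String> factoryArgs = new HashMap<String,String>();
        factoryArgs.put("pattern", ",");

        PatternTokenizerFactory factory = new PatternTokenizerFactory();
        factory.init(factoryArgs);

        Tokenizer tok = factory.create(new StringReader("Hello,There"));
        // next() is the deprecated pre-attribute API still present in 2.9.
        for (Token t = tok.next(); t != null; t = tok.next()) {
          System.out.println(t.term());         // Hello, then There
        }
      }
    }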