SOLR-1930: remove analysis API deprecations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059901 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-01-17 12:34:39 +00:00
parent 6e66286f9d
commit 56026e37a2
11 changed files with 22 additions and 567 deletions

View File

@ -210,13 +210,14 @@
<fieldtype name="engporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="custengporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="stopfilt" class="solr.TextField">

View File

@ -1,197 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource; // javadoc @link
import java.io.IOException;
import java.util.LinkedList;
/**
* Handles input and output buffering of TokenStream
*
* <pre>
* // Example of a class implementing the rule "A" "B" => "Q" "B"
* class MyTokenStream extends BufferedTokenStream {
* public MyTokenStream(TokenStream input) {super(input);}
* protected Token process(Token t) throws IOException {
* if ("A".equals(t.termText())) {
* Token t2 = read();
* if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
* if (t2!=null) pushBack(t2);
* }
* return t;
* }
* }
*
* // Example of a class implementing "A" "B" => "A" "A" "B"
* class MyTokenStream extends BufferedTokenStream {
* public MyTokenStream(TokenStream input) {super(input);}
* protected Token process(Token t) throws IOException {
* if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
* write((Token)t.clone());
* return t;
* }
* }
* </pre>
*
* NOTE: BufferedTokenStream does not clone() any Tokens. This is instead the
* responsibility of the implementing subclass. In the "A" "B" => "A" "A" "B"
* example above, the subclass must clone the additional "A" it creates.
*
* @deprecated This class does not support custom attributes. Extend TokenFilter instead,
* using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(State)}
* which support all attributes.
*/
@Deprecated
public abstract class BufferedTokenStream extends TokenFilter {
// in the future, might be faster if we implemented as an array based CircularQueue
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public BufferedTokenStream(TokenStream input) {
super(input);
}
/**
* Process a token. Subclasses may read more tokens from the input stream,
* write more tokens to the output stream, or simply return the next token
* to be output. Subclasses may return null if the token is to be dropped.
* If a subclass writes tokens to the output stream and returns a
* non-null Token, the returned Token is considered to be at the head of
* the token output stream.
*/
protected abstract Token process(Token t) throws IOException;
public final boolean incrementToken() throws IOException {
while (true) {
if (!outQueue.isEmpty()) return writeToken(outQueue.removeFirst());
Token t = read();
if (null == t) return false;
Token out = process(t);
if (null != out) return writeToken(out);
// loop back to top in case process() put something on the output queue
}
}
/**
* Read a token from the buffered input stream.
* @return null at EOS
*/
protected Token read() throws IOException {
if (inQueue.isEmpty()) {
Token t = readToken();
return t;
}
return inQueue.removeFirst();
}
/**
* Push a token back into the buffered input stream, such that it will
* be returned by a future call to <code>read()</code>
*/
protected void pushBack(Token t) {
inQueue.addFirst(t);
}
/**
* Peek n tokens ahead in the buffered input stream, without modifying
* the stream.
* @param n Number of tokens into the input stream to peek, 1 based ...
* 0 is invalid
* @return a Token which exists in the input stream, any modifications
* made to this Token will be "real" if/when the Token is
* <code>read()</code> from the stream.
*/
protected Token peek(int n) throws IOException {
int fillCount = n-inQueue.size();
for (int i=0; i < fillCount; i++) {
Token t = readToken();
if (null==t) return null;
inQueue.addLast(t);
}
return inQueue.get(n-1);
}
/** old api emulation for back compat */
private Token readToken() throws IOException {
if (!input.incrementToken()) {
return null;
} else {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
token.setType(typeAtt.type());
token.setFlags(flagsAtt.getFlags());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
token.setPayload(payloadAtt.getPayload());
return token;
}
}
/** old api emulation for back compat */
private boolean writeToken(Token token) throws IOException {
clearAttributes();
termAtt.copyBuffer(token.buffer(), 0, token.length());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
typeAtt.setType(token.type());
flagsAtt.setFlags(token.getFlags());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
payloadAtt.setPayload(token.getPayload());
return true;
}
/**
* Write a token to the buffered output stream
*/
protected void write(Token t) {
outQueue.addLast(t);
}
/**
* Provides direct Iterator access to the buffered output stream.
* Modifying any token in this Iterator will affect the resulting stream.
*/
protected Iterable<Token> output() {
return outQueue;
}
@Override
public void reset() throws IOException {
super.reset();
inQueue.clear();
outQueue.clear();
}
}

View File

@ -1,58 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.IOException;
/**
* @version $Id$
*
* @deprecated Use SnowballPorterFilterFactory with language="English" instead
*/
@Deprecated
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
protectedWords = getWordSet(loader, wordFiles, false);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private CharArraySet protectedWords = null;
public TokenFilter create(TokenStream input) {
if (protectedWords != null)
input = new KeywordMarkerFilter(input, protectedWords);
return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
}
}

View File

@ -19,13 +19,9 @@ package org.apache.solr.analysis;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.solr.common.SolrException;
@ -104,65 +100,4 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
}
}
/**
* This behaves just like String.split( ), but returns a list of Tokens
* rather then an array of strings
* NOTE: This method is not used in 1.4.
* @deprecated
*/
@Deprecated
public static List<Token> split( Matcher matcher, String input )
{
int index = 0;
int lastNonEmptySize = Integer.MAX_VALUE;
ArrayList<Token> matchList = new ArrayList<Token>();
// Add segments before each match found
while(matcher.find()) {
String match = input.subSequence(index, matcher.start()).toString();
matchList.add( new Token( match, index, matcher.start()) );
index = matcher.end();
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
}
}
// If no match is found, return the full string
if (index == 0) {
matchList.add( new Token( input, 0, input.length()) );
}
else {
String match = input.subSequence(index, input.length()).toString();
matchList.add( new Token( match, index, input.length()) );
if( match.length() > 0 ) {
lastNonEmptySize = matchList.size();
}
}
// Don't use trailing empty strings. This behavior matches String.split();
if( lastNonEmptySize < matchList.size() ) {
return matchList.subList( 0, lastNonEmptySize );
}
return matchList;
}
/**
* Create tokens from the matches in a matcher
* NOTE: This method is not used in 1.4.
* @deprecated
*/
@Deprecated
public static List<Token> group( Matcher matcher, String input, int group )
{
ArrayList<Token> matchList = new ArrayList<Token>();
while(matcher.find()) {
Token t = new Token(
matcher.group(group),
matcher.start(group),
matcher.end(group) );
matchList.add( t );
}
return matchList;
}
}

View File

@ -202,13 +202,14 @@
<fieldtype name="engporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="custengporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="stopfilt" class="solr.TextField">

View File

@ -193,13 +193,14 @@
<fieldtype name="engporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="custengporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="stopfilt" class="solr.TextField">

View File

@ -236,13 +236,14 @@
<fieldtype name="engporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="custengporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="stopfilt" class="solr.TextField">

View File

@ -252,13 +252,14 @@
<fieldtype name="engporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="custengporterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="stopfilt" class="solr.TextField">
@ -286,14 +287,14 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
@ -303,14 +304,14 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
@ -375,7 +376,7 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -384,7 +385,7 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
@ -397,7 +398,7 @@
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" expand="true" />
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldtype>

View File

@ -1,106 +0,0 @@
package org.apache.solr.analysis;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.tartarus.snowball.ext.EnglishStemmer;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;
public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
public void test() throws IOException {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold[i] = stemmer.getCurrent();
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
public void testProtected() throws Exception {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold[i] = stemmer.getCurrent();
} else {
gold[i] = test[i];
}
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
class LinesMockSolrResourceLoader implements ResourceLoader {
List<String> lines;
LinesMockSolrResourceLoader(List<String> lines) {
this.lines = lines;
}
public List<String> getLines(String resource) throws IOException {
return lines;
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return null;
}
}
}

View File

@ -33,7 +33,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;
public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
@ -59,37 +58,6 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
assertTokenStreamContents(stream, gold);
}
/**
* Tests the protected words mechanism of EnglishPorterFilterFactory
*/
@Deprecated
public void testProtectedOld() throws Exception {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold[i] = stemmer.getCurrent();
} else {
gold[i] = test[i];
}
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
class LinesMockSolrResourceLoader implements ResourceLoader {
List<String> lines;

View File

@ -1,92 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.StringReader;
/**
* Test that BufferedTokenStream behaves as advertised in subclasses.
*/
public class TestBufferedTokenStream extends BaseTokenTestCase {
/** Example of a class implementing the rule "A" "B" => "Q" "B" */
public static class AB_Q_Stream extends BufferedTokenStream {
public AB_Q_Stream(TokenStream input) {super(input);}
protected Token process(Token t) throws IOException {
if ("A".equals(new String(t.buffer(), 0, t.length()))) {
Token t2 = read();
if (t2!=null && "B".equals(new String(t2.buffer(), 0, t2.length()))) t.setEmpty().append("Q");
if (t2!=null) pushBack(t2);
}
return t;
}
}
/** Example of a class implementing "A" "B" => "A" "A" "B" */
public static class AB_AAB_Stream extends BufferedTokenStream {
public AB_AAB_Stream(TokenStream input) {super(input);}
protected Token process(Token t) throws IOException {
if ("A".equals(new String(t.buffer(), 0, t.length())) &&
"B".equals(new String(peek(1).buffer(), 0, peek(1).length())))
write((Token)t.clone());
return t;
}
}
public void testABQ() throws Exception {
final String input = "How now A B brown A cow B like A B thing?";
final String expected = "How now Q B brown A cow B like Q B thing?";
TokenStream ts = new AB_Q_Stream
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testABAAB() throws Exception {
final String input = "How now A B brown A cow B like A B thing?";
final String expected = "How now A A B brown A cow B like A A B thing?";
TokenStream ts = new AB_AAB_Stream
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testReset() throws Exception {
final String input = "How now A B brown A cow B like A B thing?";
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
TokenStream ts = new AB_AAB_Stream(tokenizer);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
assertTrue(ts.incrementToken());
assertEquals("How", term.toString());
assertTrue(ts.incrementToken());
assertEquals("now", term.toString());
assertTrue(ts.incrementToken());
assertEquals("A", term.toString());
// reset back to input,
// if reset() does not work correctly then previous buffered tokens will remain
tokenizer.reset(new StringReader(input));
ts.reset();
assertTrue(ts.incrementToken());
assertEquals("How", term.toString());
}
}