mirror of https://github.com/apache/lucene.git
SOLR-1930: remove analysis API deprecations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059901 13f79535-47bb-0310-9956-ffa450edef68
parent 6e66286f9d
commit 56026e37a2
@@ -210,13 +210,14 @@
     <fieldtype name="engporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="custengporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="stopfilt" class="solr.TextField">
@@ -1,197 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource; // javadoc @link

import java.io.IOException;
import java.util.LinkedList;

/**
 * Handles input and output buffering of TokenStream
 *
 * <pre>
 * // Example of a class implementing the rule "A" "B" => "Q" "B"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   protected Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText())) {
 *       Token t2 = read();
 *       if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
 *       if (t2!=null) pushBack(t2);
 *     }
 *     return t;
 *   }
 * }
 *
 * // Example of a class implementing "A" "B" => "A" "A" "B"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   protected Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
 *       write((Token)t.clone());
 *     return t;
 *   }
 * }
 * </pre>
 *
 * NOTE: BufferedTokenStream does not clone() any Tokens.  This is instead the
 * responsibility of the implementing subclass.  In the "A" "B" => "A" "A" "B"
 * example above, the subclass must clone the additional "A" it creates.
 *
 * @deprecated This class does not support custom attributes.  Extend TokenFilter instead,
 * using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(State)}
 * which support all attributes.
 */
@Deprecated
public abstract class BufferedTokenStream extends TokenFilter {
  // in the future, might be faster if we implemented as an array based CircularQueue
  private final LinkedList<Token> inQueue = new LinkedList<Token>();
  private final LinkedList<Token> outQueue = new LinkedList<Token>();

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  public BufferedTokenStream(TokenStream input) {
    super(input);
  }

  /**
   * Process a token.  Subclasses may read more tokens from the input stream,
   * write more tokens to the output stream, or simply return the next token
   * to be output.  Subclasses may return null if the token is to be dropped.
   * If a subclass writes tokens to the output stream and returns a
   * non-null Token, the returned Token is considered to be at the head of
   * the token output stream.
   */
  protected abstract Token process(Token t) throws IOException;

  public final boolean incrementToken() throws IOException {
    while (true) {
      if (!outQueue.isEmpty()) return writeToken(outQueue.removeFirst());
      Token t = read();
      if (null == t) return false;
      Token out = process(t);
      if (null != out) return writeToken(out);
      // loop back to top in case process() put something on the output queue
    }
  }

  /**
   * Read a token from the buffered input stream.
   * @return null at EOS
   */
  protected Token read() throws IOException {
    if (inQueue.isEmpty()) {
      Token t = readToken();
      return t;
    }
    return inQueue.removeFirst();
  }

  /**
   * Push a token back into the buffered input stream, such that it will
   * be returned by a future call to <code>read()</code>
   */
  protected void pushBack(Token t) {
    inQueue.addFirst(t);
  }

  /**
   * Peek n tokens ahead in the buffered input stream, without modifying
   * the stream.
   * @param n Number of tokens into the input stream to peek, 1 based ...
   *          0 is invalid
   * @return a Token which exists in the input stream, any modifications
   *         made to this Token will be "real" if/when the Token is
   *         <code>read()</code> from the stream.
   */
  protected Token peek(int n) throws IOException {
    int fillCount = n-inQueue.size();
    for (int i=0; i < fillCount; i++) {
      Token t = readToken();
      if (null==t) return null;
      inQueue.addLast(t);
    }
    return inQueue.get(n-1);
  }

  /** old api emulation for back compat */
  private Token readToken() throws IOException {
    if (!input.incrementToken()) {
      return null;
    } else {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      token.setPayload(payloadAtt.getPayload());
      return token;
    }
  }

  /** old api emulation for back compat */
  private boolean writeToken(Token token) throws IOException {
    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    flagsAtt.setFlags(token.getFlags());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }

  /**
   * Write a token to the buffered output stream
   */
  protected void write(Token t) {
    outQueue.addLast(t);
  }

  /**
   * Provides direct Iterator access to the buffered output stream.
   * Modifying any token in this Iterator will affect the resulting stream.
   */
  protected Iterable<Token> output() {
    return outQueue;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    inQueue.clear();
    outQueue.clear();
  }

}
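The @deprecated tag above names the replacement pattern, but the commit deletes the class without leaving a worked example behind. Below is a minimal sketch of the second javadoc example, the "A" "B" => "A" "A" "B" rule, rewritten as a plain TokenFilter over captureState()/restoreState(); the filter name is hypothetical, and only the attribute and state APIs already used in this file are assumed.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

public final class DuplicateABeforeBFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private AttributeSource.State pendingA;    // second copy of "A", emitted before the lookahead
  private AttributeSource.State pendingNext; // buffered lookahead token

  public DuplicateABeforeBFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Drain buffered states first; restoreState() copies every attribute back,
    // which is what makes Token-based buffering like the class above unnecessary.
    if (pendingA != null) {
      restoreState(pendingA);
      pendingA = null;
      return true;
    }
    if (pendingNext != null) {
      restoreState(pendingNext);
      pendingNext = null;
      return true;
    }
    if (!input.incrementToken()) return false;
    if ("A".equals(termAtt.toString())) {
      AttributeSource.State aState = captureState(); // snapshot of "A", all attributes included
      if (input.incrementToken()) {                  // peek one token ahead
        if ("B".equals(termAtt.toString())) {
          pendingA = aState;                         // replay "A" a second time on the next call
        }
        pendingNext = captureState();                // push the lookahead back in either case
      }
      restoreState(aState);                          // emit the original "A" now
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pendingA = null;
    pendingNext = null;
  }
}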
@@ -1,58 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

import java.io.IOException;

/**
 * @version $Id$
 *
 * @deprecated Use SnowballPorterFilterFactory with language="English" instead
 */
@Deprecated
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  public static final String PROTECTED_TOKENS = "protected";

  public void inform(ResourceLoader loader) {
    String wordFiles = args.get(PROTECTED_TOKENS);
    if (wordFiles != null) {
      try {
        protectedWords = getWordSet(loader, wordFiles, false);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  private CharArraySet protectedWords = null;

  public TokenFilter create(TokenStream input) {
    if (protectedWords != null)
      input = new KeywordMarkerFilter(input, protectedWords);
    return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
  }

}
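For schema authors, the @deprecated message above is the whole migration story. A sketch of the equivalent field-type configuration, not copied from this commit: the KeywordMarkerFilterFactory line mirrors the schema edits elsewhere in this diff, which pair it with PorterStemFilterFactory rather than the Snowball factory, and protwords.txt is the file name those edits use.

<analyzer>
  <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
  <filter class="solr.SnowballPorterFilterFactory" language="English"/>
</analyzer>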
@@ -19,13 +19,9 @@ package org.apache.solr.analysis;
 
 import java.io.IOException;
 import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
 import java.util.Map;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.solr.common.SolrException;
 
@@ -104,65 +100,4 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
     }
   }
-
-  /**
-   * This behaves just like String.split(), but returns a list of Tokens
-   * rather than an array of strings.
-   * NOTE: This method is not used in 1.4.
-   * @deprecated
-   */
-  @Deprecated
-  public static List<Token> split( Matcher matcher, String input )
-  {
-    int index = 0;
-    int lastNonEmptySize = Integer.MAX_VALUE;
-    ArrayList<Token> matchList = new ArrayList<Token>();
-
-    // Add segments before each match found
-    while(matcher.find()) {
-      String match = input.subSequence(index, matcher.start()).toString();
-      matchList.add( new Token( match, index, matcher.start()) );
-      index = matcher.end();
-      if( match.length() > 0 ) {
-        lastNonEmptySize = matchList.size();
-      }
-    }
-
-    // If no match is found, return the full string
-    if (index == 0) {
-      matchList.add( new Token( input, 0, input.length()) );
-    }
-    else {
-      String match = input.subSequence(index, input.length()).toString();
-      matchList.add( new Token( match, index, input.length()) );
-      if( match.length() > 0 ) {
-        lastNonEmptySize = matchList.size();
-      }
-    }
-
-    // Don't include trailing empty strings.  This behavior matches String.split().
-    if( lastNonEmptySize < matchList.size() ) {
-      return matchList.subList( 0, lastNonEmptySize );
-    }
-    return matchList;
-  }
-
-  /**
-   * Create tokens from the matches in a matcher.
-   * NOTE: This method is not used in 1.4.
-   * @deprecated
-   */
-  @Deprecated
-  public static List<Token> group( Matcher matcher, String input, int group )
-  {
-    ArrayList<Token> matchList = new ArrayList<Token>();
-    while(matcher.find()) {
-      Token t = new Token(
-        matcher.group(group),
-        matcher.start(group),
-        matcher.end(group) );
-      matchList.add( t );
-    }
-    return matchList;
-  }
 }
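Both deleted helpers duplicated behavior that PatternTokenizer, imported above, provides natively: a group of -1 splits on the pattern like the removed split(), and a group index of 0 or higher emits that capture group like the removed group(). A small usage sketch, assuming the three-argument constructor PatternTokenizer(Reader, Pattern, int) of this Lucene line; the demo class is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PatternTokenizerDemo {
  public static void main(String[] args) throws IOException {
    // group = -1: split on the pattern, matching String.split() semantics.
    Pattern commas = Pattern.compile("\\s*,\\s*");
    Tokenizer tok = new PatternTokenizer(new StringReader("one, two , three"), commas, -1);
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // prints: one, two, three
    }
    tok.end();
    tok.close();
  }
}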
@@ -202,13 +202,14 @@
     <fieldtype name="engporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="custengporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="stopfilt" class="solr.TextField">

@@ -193,13 +193,14 @@
     <fieldtype name="engporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="custengporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="stopfilt" class="solr.TextField">

@@ -236,13 +236,14 @@
     <fieldtype name="engporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="custengporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="stopfilt" class="solr.TextField">

@@ -252,13 +252,14 @@
     <fieldtype name="engporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="custengporterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
     <fieldtype name="stopfilt" class="solr.TextField">
@@ -286,14 +287,14 @@
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.StopFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.StopFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
 

@@ -303,14 +304,14 @@
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
         <filter class="solr.StopFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.StopFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldtype>
 

@@ -375,7 +376,7 @@
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
                 catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>

@@ -384,7 +385,7 @@
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
                 catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldType>
 

@@ -397,7 +398,7 @@
       <tokenizer class="solr.WhitespaceTokenizerFactory"/>
       <filter class="solr.SynonymFilterFactory"
              synonyms="synonyms.txt" expand="true" />
-      <filter class="solr.EnglishPorterFilterFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
       <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
     </analyzer>
   </fieldtype>
@@ -1,106 +0,0 @@
package org.apache.solr.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.tartarus.snowball.ext.EnglishStemmer;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;

public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {

  public void test() throws IOException {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      stemmer.setCurrent(test[i]);
      stemmer.stem();
      gold[i] = stemmer.getCurrent();
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);

    factory.init(args);
    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, gold);
  }

  public void testProtected() throws Exception {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
        stemmer.setCurrent(test[i]);
        stemmer.stem();
        gold[i] = stemmer.getCurrent();
      } else {
        gold[i] = test[i];
      }
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
    factory.init(args);
    List<String> lines = new ArrayList<String>();
    Collections.addAll(lines, "banks", "fledgling");
    factory.inform(new LinesMockSolrResourceLoader(lines));
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, gold);
  }

  class LinesMockSolrResourceLoader implements ResourceLoader {
    List<String> lines;

    LinesMockSolrResourceLoader(List<String> lines) {
      this.lines = lines;
    }

    public List<String> getLines(String resource) throws IOException {
      return lines;
    }

    public Object newInstance(String cname, String... subpackages) {
      return null;
    }

    public InputStream openResource(String resource) throws IOException {
      return null;
    }
  }
}
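The deleted test above exercised protected words through the deprecated factory; after this commit the same behavior is composed explicitly: KeywordMarkerFilter flags the protected terms and SnowballFilter leaves flagged tokens unstemmed. A minimal standalone sketch using only classes the deleted code already imports; the demo class name and the Version.LUCENE_31 match-version are assumptions, not taken from the commit.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ProtectedStemmingDemo {
  public static void main(String[] args) throws IOException {
    // "banks" and "fledgling" are protected: KeywordMarkerFilter sets the
    // keyword attribute, and SnowballFilter skips keyword-marked tokens.
    CharArraySet protWords = new CharArraySet(Version.LUCENE_31,
        Arrays.asList("banks", "fledgling"), false);
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
        new StringReader("the fledgling banks were banking"));
    ts = new KeywordMarkerFilter(ts, protWords);
    ts = new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "banking" stems to "bank"; protected words pass through
    }
    ts.end();
    ts.close();
  }
}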
@@ -33,7 +33,6 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.ArrayList;
-import java.util.Collections;
 
 public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
 

@@ -59,37 +58,6 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     assertTokenStreamContents(stream, gold);
   }
 
-  /**
-   * Tests the protected words mechanism of EnglishPorterFilterFactory
-   */
-  @Deprecated
-  public void testProtectedOld() throws Exception {
-    EnglishStemmer stemmer = new EnglishStemmer();
-    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    String[] gold = new String[test.length];
-    for (int i = 0; i < test.length; i++) {
-      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
-        stemmer.setCurrent(test[i]);
-        stemmer.stem();
-        gold[i] = stemmer.getCurrent();
-      } else {
-        gold[i] = test[i];
-      }
-    }
-
-    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
-    factory.init(args);
-    List<String> lines = new ArrayList<String>();
-    Collections.addAll(lines, "banks", "fledgling");
-    factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
-        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
-    TokenStream stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, gold);
-  }
-
   class LinesMockSolrResourceLoader implements ResourceLoader {
     List<String> lines;
 
@@ -1,92 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;

/**
 * Test that BufferedTokenStream behaves as advertised in subclasses.
 */
public class TestBufferedTokenStream extends BaseTokenTestCase {

  /** Example of a class implementing the rule "A" "B" => "Q" "B" */
  public static class AB_Q_Stream extends BufferedTokenStream {
    public AB_Q_Stream(TokenStream input) {super(input);}
    protected Token process(Token t) throws IOException {
      if ("A".equals(new String(t.buffer(), 0, t.length()))) {
        Token t2 = read();
        if (t2!=null && "B".equals(new String(t2.buffer(), 0, t2.length()))) t.setEmpty().append("Q");
        if (t2!=null) pushBack(t2);
      }
      return t;
    }
  }

  /** Example of a class implementing "A" "B" => "A" "A" "B" */
  public static class AB_AAB_Stream extends BufferedTokenStream {
    public AB_AAB_Stream(TokenStream input) {super(input);}
    protected Token process(Token t) throws IOException {
      if ("A".equals(new String(t.buffer(), 0, t.length())) &&
          "B".equals(new String(peek(1).buffer(), 0, peek(1).length())))
        write((Token)t.clone());
      return t;
    }
  }

  public void testABQ() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    final String expected = "How now Q B brown A cow B like Q B thing?";
    TokenStream ts = new AB_Q_Stream
      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
    assertTokenStreamContents(ts, expected.split("\\s"));
  }

  public void testABAAB() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    final String expected = "How now A A B brown A cow B like A A B thing?";
    TokenStream ts = new AB_AAB_Stream
      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
    assertTokenStreamContents(ts, expected.split("\\s"));
  }

  public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.toString());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.toString());
    // reset back to the input:
    // if reset() does not work correctly, previously buffered tokens will remain
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.toString());
  }
}