mirror of https://github.com/apache/lucene.git
SOLR-330: Converted Solr tokenstreams to use Lucene's char[] capabilities
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@643465 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent c124044825
commit e2c2a8d240
CHANGES.txt
@@ -867,6 +867,7 @@ Optimizations
     a single token per document (not multiValued & not tokenized) by using the
     Lucene FieldCache entry for that field to tally term counts. The first request
     utilizing the FieldCache will take longer than subsequent ones.
+ 7. Converted TokenStreams to use Lucene's new char array based capabilities. (SOLR-330, gsingers)
 
 Bug Fixes
 1. Fixed delete-by-id for field types who's indexed form is different
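The theme of every hunk below is the move from Lucene's String-based Token accessors (termText(), setTermText()) to the char[]-based ones introduced in the Lucene 2.3 era. As a minimal sketch of that API, not part of the commit itself:

    import org.apache.lucene.analysis.Token;

    // Sketch only: the char[]-based Token API this patch adopts throughout Solr.
    public class TermBufferSketch {
      public static void main(String[] args) {
        Token token = new Token(0, 5);                    // offsets only; no String
        token.setTermBuffer("Hello".toCharArray(), 0, 5); // set the term characters

        // Old style (allocates a String per call): token.termText()
        // New style (no allocation): read the valid region of the buffer.
        char[] buf = token.termBuffer();
        int len = token.termLength();
        for (int i = 0; i < len; i++) {
          buf[i] = Character.toLowerCase(buf[i]);         // mutate in place
        }
        System.out.println(new String(buf, 0, len));      // prints "hello"
      }
    }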
BufferedTokenStream.java
@@ -55,7 +55,7 @@ import java.util.LinkedList;
  * @version $Id$
  */
 public abstract class BufferedTokenStream extends TokenStream {
-  // in the futute, might be faster if we implemented as an array based CircularQueue
+  // in the future, might be faster if we implemented as an array based CircularQueue
   private final LinkedList<Token> inQueue = new LinkedList<Token>();
   private final LinkedList<Token> outQueue = new LinkedList<Token>();
   private final TokenStream input;
EnglishPorterFilterFactory.java
@@ -17,97 +17,92 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
 
-import java.util.List;
-import java.util.Set;
 import java.io.IOException;
+import java.util.List;
 
 /**
  * @version $Id$
  */
 public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
 
   public void inform(ResourceLoader loader) {
-    String wordFile = args.get("protected");
+    String wordFile = args.get(PROTECTED_TOKENS);
     if (wordFile != null) {
       try {
         List<String> wlist = loader.getLines(wordFile);
-        protectedWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]));
+        //This cast is safe in Lucene
+        protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }
   }
 
-  private Set protectedWords = null;
+  private CharArraySet protectedWords = null;
 
   public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input,protectedWords);
+    return new EnglishPorterFilter(input, protectedWords);
   }
 
 }
 
 
-/** English Porter2 filter that doesn't use reflection to
-/*  adapt lucene to the snowball stemmer code.
+/**
+ * English Porter2 filter that doesn't use reflection to
+ * adapt lucene to the snowball stemmer code.
  */
 class EnglishPorterFilter extends TokenFilter {
-  private final Set protWords;
+  private final CharArraySet protWords;
   private net.sf.snowball.ext.EnglishStemmer stemmer;
 
-  public EnglishPorterFilter(TokenStream source, Set protWords) {
+  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
     super(source);
-    this.protWords=protWords;
+    this.protWords = protWords;
     stemmer = new net.sf.snowball.ext.EnglishStemmer();
   }
 
-  /** the original code from lucene sandbox
-  public final Token next() throws IOException {
-    Token token = input.next();
-    if (token == null)
-      return null;
-    stemmer.setCurrent(token.termText());
-    try {
-      stemMethod.invoke(stemmer, EMPTY_ARGS);
-    } catch (Exception e) {
-      throw new RuntimeException(e.toString());
-    }
-    return new Token(stemmer.getCurrent(),
-                     token.startOffset(), token.endOffset(), token.type());
-  }
-  **/
+  /**
+   * the original code from lucene sandbox
+   * public final Token next() throws IOException {
+   *   Token token = input.next();
+   *   if (token == null)
+   *     return null;
+   *   stemmer.setCurrent(token.termText());
+   *   try {
+   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
+   *   } catch (Exception e) {
+   *     throw new RuntimeException(e.toString());
+   *   }
+   *   return new Token(stemmer.getCurrent(),
+   *                    token.startOffset(), token.endOffset(), token.type());
+   * }
+   */
 
   @Override
-  public Token next() throws IOException {
-    Token tok = input.next();
-    if (tok==null) return null;
-    String tokstr = tok.termText();
-    // if protected, don't stem.  use this to avoid stemming collisions.
-    if (protWords != null && protWords.contains(tokstr)) {
-      return tok;
-    }
-    stemmer.setCurrent(tokstr);
-    stemmer.stem();
-    String newstr = stemmer.getCurrent();
-    if (tokstr.equals(newstr)) {
-      return tok;
-    } else {
-      // TODO: it would be nice if I could just set termText directly like
-      // lucene packages can.
-      Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type());
-      newtok.setPositionIncrement(tok.getPositionIncrement());
-      return newtok;
-    }
+  public Token next(Token token) throws IOException {
+    Token result = input.next(token);
+    if (result != null) {
+      char[] termBuffer = result.termBuffer();
+      int len = result.termLength();
+      // if protected, don't stem.  use this to avoid stemming collisions.
+      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        return result;
+      }
+      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+      stemmer.stem();
+      String newstr = stemmer.getCurrent();
+      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+    }
+    return result;
  }
 }
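The factory above replaces a StopFilter-built Set of Strings with Lucene's CharArraySet, whose contains(char[], int, int) overload tests the term buffer directly, so no String is built per token. A small sketch of that behavior (values here are illustrative, not from the patch):

    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;

    public class CharArraySetExample {
      public static void main(String[] args) {
        // false => case-sensitive, matching the factory's new CharArraySet(wlist, false)
        CharArraySet protectedWords =
            new CharArraySet(Arrays.asList("banks", "fledgling"), false);

        char[] termBuffer = {'b', 'a', 'n', 'k', 's', '_', '_'}; // buffers may be oversized
        int termLength = 5;                                      // only the first 5 chars are valid

        // The set hashes the char range directly; no String allocation for the lookup.
        System.out.println(protectedWords.contains(termBuffer, 0, termLength)); // true
      }
    }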
HyphenatedWordsFilter.java
@@ -28,25 +28,26 @@ import org.apache.lucene.analysis.*;
  * This filter should be used on indexing time only.
  * Example field definition in schema.xml:
  * <pre>
  * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
  *   <analyzer type="index">
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  *     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
  *     <filter class="solr.StopFilterFactory" ignoreCase="true"/>
  *     <filter class="solr.HyphenatedWordsFilterFactory"/>
  *     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
  *     <filter class="solr.LowerCaseFilterFactory"/>
  *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  *   </analyzer>
  *   <analyzer type="query">
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
  *     <filter class="solr.StopFilterFactory" ignoreCase="true"/>
  *     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
  *     <filter class="solr.LowerCaseFilterFactory"/>
  *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  *   </analyzer>
  * </fieldtype>
+ * </pre>
  *
  */
 public final class HyphenatedWordsFilter extends TokenFilter {

@@ -55,16 +56,18 @@ public final class HyphenatedWordsFilter extends TokenFilter {
     super(in);
   }
 
+
   /**
    * @inheritDoc
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public final Token next() throws IOException {
-    StringBuffer termText = new StringBuffer(25);
+  public final Token next(Token in) throws IOException {
+    StringBuilder termText = new StringBuilder(25);
     int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
     Token lastToken = null;
-    for (Token token = input.next(); token != null; token = input.next()) {
-      termText.append(token.termText());
+    for (Token token = input.next(in); token != null; token = input.next()) {
+      termText.append(token.termBuffer(), 0, token.termLength());
       //current token ends with hyphen -> grab the next token and glue them together
       if (termText.charAt(termText.length() - 1) == '-') {
         wordsMerged++;
KeepWordFilter.java
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;
 
 import java.io.IOException;
 import java.util.Set;

@@ -32,23 +33,18 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final Set<String> words;
-  final boolean ignoreCase;
+  final CharArraySet words;
 
   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
-    this.words=words;
-    this.ignoreCase=ignoreCase;
+    this.words = new CharArraySet(words, ignoreCase);
   }
 
   @Override
-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
-      String txt = ignoreCase
-        ? token.termText().toLowerCase()
-        : token.termText();
-
-      if( words.contains( txt ) ) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next()) {
+      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
         return token;
       }
     }
LengthFilter.java
@@ -36,8 +36,8 @@ public final class LengthFilter extends TokenFilter {
     //System.out.println("min="+min+" max="+max);
   }
 
-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next(in)) {
       final int len = token.endOffset() - token.startOffset();
       if (len<min || len>max) continue;
       return token;
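Nearly every filter in this commit also moves from next() to the reusable next(Token) variant, where the caller passes in a Token that the producer may fill and return instead of allocating a fresh one per term. A minimal sketch of a filter written against that contract (illustrative only, Lucene 2.3-era API):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Sketch of the reusable-token pattern adopted throughout this commit.
    class PassThroughFilter extends TokenFilter {
      PassThroughFilter(TokenStream input) {
        super(input);
      }

      // The caller supplies a Token; upstream streams may fill and return it
      // rather than allocating a new Token per term.
      public Token next(Token in) throws IOException {
        Token result = input.next(in);   // may be 'in' itself, reused
        if (result == null) {
          return null;                   // end of stream
        }
        // ... inspect/modify result.termBuffer() up to result.termLength() ...
        return result;
      }
    }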
LengthFilterFactory.java
@@ -27,12 +27,14 @@ import java.util.Map;
  */
 public class LengthFilterFactory extends BaseTokenFilterFactory {
   int min,max;
+  public static final String MIN_KEY = "min";
+  public static final String MAX_KEY = "max";
 
   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    min=Integer.parseInt(args.get("min"));
-    max=Integer.parseInt(args.get("max"));
+    min=Integer.parseInt(args.get(MIN_KEY));
+    max=Integer.parseInt(args.get(MAX_KEY));
   }
   public LengthFilter create(TokenStream input) {
     return new LengthFilter(input,min,max);
PatternReplaceFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.IOException;
+import java.nio.CharBuffer;
 
 /**
  * A TokenFilter which applies a Pattern to each token in the stream,

@@ -64,12 +65,12 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.all=all;
   }
 
-  public final Token next() throws IOException {
-    Token t = input.next();
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
     if (t == null)
       return null;
-    Matcher m = p.matcher(t.termText());
+    CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
+    Matcher m = p.matcher(text);
     if (all) {
       t.setTermText(m.replaceAll(replacement));
     } else {
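The CharBuffer.wrap call above is how the patch runs a regex over the token's char[] without first copying it into a String: java.nio.CharBuffer implements CharSequence, which is all that Matcher needs. A standalone sketch of the same trick:

    import java.nio.CharBuffer;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class CharBufferRegexExample {
      public static void main(String[] args) {
        char[] termBuffer = "aabfooaabfoo".toCharArray();
        int termLength = termBuffer.length;

        // Wrap the valid region of the buffer as a CharSequence; no copy is made.
        CharSequence text = CharBuffer.wrap(termBuffer, 0, termLength);

        Matcher m = Pattern.compile("a*b").matcher(text);
        System.out.println(m.replaceAll("-")); // prints "-foo-foo"
      }
    }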
PhoneticFilter.java
@@ -46,29 +46,27 @@ public class PhoneticFilter extends TokenFilter
   }
 
   @Override
-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {
     if( save != null ) {
       Token temp = save;
       save = null;
       return temp;
     }
 
-    Token t = input.next();
+    Token t = input.next(in);
     if( t != null ) {
-      String value = t.termText();
+      String value = new String(t.termBuffer(), 0, t.termLength());
       try {
-        value = encoder.encode(t.termText()).toString();
+        value = encoder.encode(value).toString();
       }
       catch (Exception ignored) {} // just use the direct text
-      Token m = new Token(value, t.startOffset(), t.endOffset(), name );
+      //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
       if( inject ) {
-        m.setPositionIncrement(0);
-        save = m;
-      }
-      else {
-        // replace the token rather then add it too the stream
-        return m;
+        save = (Token) t.clone();
+        save.setPositionIncrement(0);
+        save.setTermBuffer(value.toCharArray(), 0, value.length());
+      } else {
+        t.setTermBuffer(value.toCharArray(), 0, value.length());
       }
     }
     return t;
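When inject is true, the rewritten filter now returns the original token first and stores a clone carrying the phonetic form with a position increment of 0, so the next call emits the variant stacked at the same position. A small sketch of what position increment 0 means for the injected token (illustrative values, not from the patch):

    import org.apache.lucene.analysis.Token;

    public class InjectExample {
      public static void main(String[] args) {
        // Original token at some position.
        Token original = new Token(0, 5);
        original.setTermBuffer("smith".toCharArray(), 0, 5);

        // Injected variant: same offsets, but position increment 0 means it
        // occupies the same position as the original for phrase queries.
        // "SM0" stands in for a phonetic code (e.g. a Metaphone-style encoding).
        Token phonetic = (Token) original.clone();
        phonetic.setPositionIncrement(0);
        phonetic.setTermBuffer("SM0".toCharArray(), 0, 3);

        System.out.println(original.getPositionIncrement()); // 1 (default)
        System.out.println(phonetic.getPositionIncrement()); // 0 (stacked)
      }
    }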
RemoveDuplicatesTokenFilter.java
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.util.ArraysUtils;
 
 import java.io.IOException;

@@ -30,23 +31,27 @@ public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
   public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
   protected Token process(Token t) throws IOException {
     Token tok = read();
-    OUT: while (tok != null && tok.getPositionIncrement()==0) {
+    while (tok != null && tok.getPositionIncrement()==0) {
       if (null != t) {
         write(t);
         t = null;
       }
       boolean dup=false;
-      IN: for (Token outTok : output()) {
-        if (outTok.termText().equals(tok.termText())) {
+      for (Token outTok : output()) {
+        int tokLen = tok.termLength();
+        if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
           dup=true;
-          break IN;
+          //continue;;
         }
       }
-      if (!dup)
+      if (!dup){
         write(tok);
+      }
       tok = read();
     }
-    if (tok != null) pushBack(tok);
+    if (tok != null) {
+      pushBack(tok);
+    }
     return t;
   }
 }
TrimFilter.java
@@ -17,9 +17,9 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 
 import java.io.IOException;

@@ -29,50 +29,54 @@ import java.io.IOException;
  * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {
 
   final boolean updateOffsets;
 
-  public TrimFilter(TokenStream in, boolean updateOffsets ) {
+  public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
     this.updateOffsets = updateOffsets;
   }
 
   @Override
-  public final Token next() throws IOException {
-    Token t = input.next();
-    if (null == t || null == t.termText())
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
+    if (null == t || null == t.termBuffer() || t.termLength() == 0){
       return t;
+    }
+    char[] termBuffer = t.termBuffer();
+    int len = t.termLength();
+    int start = 0;
+    int end = 0;
+    int endOff = 0;
 
-    if( updateOffsets ) {
-      String txt = t.termText();
-      int start = 0;
-      int end = txt.length();
-      int endOff = 0;
-
-      // eat the first characters
-      while ((start < end) && (txt.charAt(start) <= ' ')) {
-        start++;
-      }
-
-      // eat the end characters
-      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
-        end--;
-        endOff++;
-      }
-
-      if( start > 0 || end < txt.length() ) {
-        int incr = t.getPositionIncrement();
-        t = new Token( t.termText().substring( start, end ),
-                       t.startOffset()+start,
-                       t.endOffset()-endOff,
-                       t.type() );
-
-        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
-      }
-    }
-    else {
-      t.setTermText( t.termText().trim() );
+    // eat the first characters
+    //QUESTION: Should we use Character.isWhitespace() instead?
+    for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
     }
+    // eat the end characters
+    for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
+      endOff++;
+    }
+    if (start > 0 || end < len) {
+      if (start < end) {
+        t.setTermBuffer(t.termBuffer(), start, (end - start));
+      } else {
+        t.setTermLength(0);
+      }
+      if (updateOffsets) {
+        t.setStartOffset(t.startOffset() + start);
+        if (start < end) {
+          t.setEndOffset(t.endOffset() - endOff);
+        } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+      }
+      /*t = new Token( t.termText().substring( start, end ),
+                     t.startOffset()+start,
+                     t.endOffset()-endOff,
+                     t.type() );*/
+    }
 
     return t;
   }
 }
WordDelimiterFilter.java
@@ -192,7 +192,7 @@ final class WordDelimiterFilter extends TokenFilter {
   // use the type of the first char as the type
   // of the token.
   private int tokType(Token t) {
-    return charType(t.termText().charAt(0));
+    return charType(t.termBuffer()[0]);
   }
 
   // There isn't really an efficient queue class, so we will

@@ -207,23 +207,22 @@ final class WordDelimiterFilter extends TokenFilter {
   private Token newTok(Token orig, int start, int end) {
     int startOff = orig.startOffset();
     int endOff = orig.endOffset();
-    String origStr = orig.termText();
-
     // if length by start + end offsets doesn't match the term text then assume
     // this is a synonym and don't adjust the offsets.
-    if (origStr.length() == endOff-startOff) {
+    if (orig.termLength() == endOff-startOff) {
       endOff = startOff + end;
       startOff += start;
     }
-    return new Token(orig.termText().substring(start,end),
-                     startOff,
-                     endOff,
-                     orig.type());
+    Token newTok = new Token(startOff,
+                     endOff,
+                     orig.type());
+    newTok.setTermBuffer(orig.termBuffer(), start, (end - start));
+    return newTok;
   }
 
 
-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {
 
     // check the queue first
     if (queuePos<queue.size()) {

@@ -248,25 +247,25 @@ final class WordDelimiterFilter extends TokenFilter {
     Token t = input.next();
     if (t == null) return null;
 
-    String s = t.termText();
+    char [] termBuffer = t.termBuffer();
+    int len = t.termLength();
     int start=0;
-    int end=s.length();
-    if (end==0) continue;
+    if (len ==0) continue;
 
     origPosIncrement = t.getPositionIncrement();
 
     // Avoid calling charType more than once for each char (basically
     // avoid any backtracking).
     // makes code slightly more difficult, but faster.
-    int ch=s.charAt(start);
+    int ch=termBuffer[start];
     int type=charType(ch);
 
     int numWords=0;
 
-    while (start<end) {
+    while (start< len) {
       // first eat delimiters at the start of this subword
-      while ((type & SUBWORD_DELIM)!=0 && ++start<end) {
-        ch=s.charAt(start);
+      while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
+        ch=termBuffer[start];
         type=charType(ch);
       }

@@ -278,23 +277,23 @@ final class WordDelimiterFilter extends TokenFilter {
     int lastType=type;  // type of the previously read char
 
-    while (pos<end) {
+    while (pos< len) {
       if (type!=lastType) {
         // check and remove "'s" from the end of a token.
         // the pattern to check for is
         // ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
         if ((lastType & ALPHA)!=0) {
-          if (ch=='\'' && pos+1<end
-              && (s.charAt(pos+1)=='s' || s.charAt(pos+1)=='S'))
+          if (ch=='\'' && pos+1< len
+              && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
           {
             int subWordEnd=pos;
-            if (pos+2>=end) {
+            if (pos+2>= len) {
               // end of string detected after "'s"
               pos+=2;
             } else {
               // make sure that a delimiter follows "'s"
-              int ch2 = s.charAt(pos+2);
+              int ch2 = termBuffer[pos+2];
               int type2 = charType(ch2);
               if ((type2 & SUBWORD_DELIM)!=0) {
                 // if delimiter, move position pointer

@@ -340,7 +339,7 @@ final class WordDelimiterFilter extends TokenFilter {
       }
     }
 
-    if (++pos >= end) {
+    if (++pos >= len) {
       if (start==0) {
         // the subword is the whole original token, so
         // return it unchanged.

@@ -362,7 +361,7 @@ final class WordDelimiterFilter extends TokenFilter {
     }
 
     lastType = type;
-    ch = s.charAt(pos);
+    ch = termBuffer[pos];
     type = charType(ch);
   }

@@ -482,7 +481,7 @@ final class WordDelimiterFilter extends TokenFilter {
     tok = lst.get(i);
     if (catenateSubwords) {
       if (i==start) firstTok=tok;
-      sb.append(tok.termText());
+      sb.append(tok.termBuffer(), 0, tok.termLength());
     }
     if (generateSubwords) {
       queue.add(tok);
ArraysUtils.java (new file)
@@ -0,0 +1,35 @@
+package org.apache.solr.util;
+
+/**
+ *
+ *
+ **/
+//Since Arrays.equals doesn't implement offsets for equals
+public class ArraysUtils {
+
+  /**
+   * See if two array slices are the same.
+   *
+   * @param left        The left array to compare
+   * @param offsetLeft  The offset into the array.  Must be positive
+   * @param right       The right array to compare
+   * @param offsetRight the offset into the right array.  Must be positive
+   * @param length      The length of the section of the array to compare
+   * @return true if the two arrays, starting at their respective offsets, are equal
+   *
+   * @see java.util.Arrays#equals(char[], char[])
+   */
+  public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
+    if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
+      for (int i = 0; i < length; i++) {
+        if (left[offsetLeft + i] != right[offsetRight + i]) {
+          return false;
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+}
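A quick usage sketch of the new helper: because it takes offsets, it can compare two token term buffers over just their valid regions, which java.util.Arrays.equals(char[], char[]) cannot do (the sample buffers below are illustrative):

    import org.apache.solr.util.ArraysUtils;

    public class ArraysUtilsExample {
      public static void main(String[] args) {
        // Two oversized term buffers whose first five chars spell the same term.
        char[] a = {'h', 'e', 'l', 'l', 'o', 'x', 'x'};
        char[] b = {'h', 'e', 'l', 'l', 'o'};

        // Arrays.equals(a, b) would be false (different array lengths);
        // comparing the valid slices directly reports the terms as equal.
        System.out.println(ArraysUtils.equals(a, 0, b, 0, 5)); // true
        System.out.println(ArraysUtils.equals(a, 1, b, 1, 4)); // true ("ello")
      }
    }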
EnglishPorterFilterFactoryTest.java (new file)
@@ -0,0 +1,96 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import net.sf.snowball.ext.EnglishStemmer;
+import org.apache.solr.common.ResourceLoader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      stemmer.setCurrent(test[i]);
+      stemmer.stem();
+      gold.append(stemmer.getCurrent()).append(' ');
+    }
+
+    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+
+    factory.init(args);
+    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  public void testProtected() throws Exception {
+    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
+        stemmer.setCurrent(test[i]);
+        stemmer.stem();
+        gold.append(stemmer.getCurrent()).append(' ');
+      } else {
+        gold.append(test[i]).append(' ');
+      }
+    }
+
+    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
+    factory.init(args);
+    List<String> lines = new ArrayList<String>();
+    Collections.addAll(lines, "banks", "fledgling");
+    factory.inform(new LinesMockSolrResourceLoader(lines));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  class LinesMockSolrResourceLoader implements ResourceLoader {
+    List<String> lines;
+
+    LinesMockSolrResourceLoader(List<String> lines) {
+      this.lines = lines;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return lines;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return null;
+    }
+  }
+}
LengthFilterTest.java (new file)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class LengthFilterTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    LengthFilterFactory factory = new LengthFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
+    args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    factory.init(args);
+    String[] test = {"foo", "foobar", "super-duper-trooper"};
+    String gold = "foobar";
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString(), out);
+  }
+}
TestHyphenatedWordsFilter.java
@@ -27,8 +27,8 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
  */
 public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
-    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
-    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
+    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
+    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
     // first test
     TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
     ts = new HyphenatedWordsFilter(ts);
TestPatternReplaceFilter.java
@@ -17,76 +17,96 @@
 
 package org.apache.solr.analysis;
 
-import java.io.StringReader;
-import java.util.regex.Pattern;
-import junit.framework.TestCase;
-
+import org.apache.lucene.analysis.Token;
+
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 
+import java.io.StringReader;
+import java.util.regex.Pattern;
+
 /**
  * @version $Id:$
  */
 public class TestPatternReplaceFilter extends AnalysisTestCase {
 
   public void testReplaceAll() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
       (new WhitespaceTokenizer(new StringReader(input)),
        Pattern.compile("a*b"),
        "-", true);
-    assertEquals("-foo-foo-foo-", ts.next().termText());
-    assertEquals("-", ts.next().termText());
-    assertEquals("c-", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }
 
   public void testReplaceFirst() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
       (new WhitespaceTokenizer(new StringReader(input)),
        Pattern.compile("a*b"),
        "-", false);
-    assertEquals("-fooaabfooabfoob", ts.next().termText());
-    assertEquals("-", ts.next().termText());
-    assertEquals("c-", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }
 
   public void testStripFirst() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
       (new WhitespaceTokenizer(new StringReader(input)),
        Pattern.compile("a*b"),
        null, false);
-    assertEquals("fooaabfooabfoob", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertEquals("c", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }
 
   public void testStripAll() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
       (new WhitespaceTokenizer(new StringReader(input)),
        Pattern.compile("a*b"),
        null, true);
-    assertEquals("foofoofoo", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertEquals("c", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }
 
   public void testReplaceAllWithBackRef() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
       (new WhitespaceTokenizer(new StringReader(input)),
        Pattern.compile("(a*)b"),
        "$1\\$", true);
-    assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
-    assertEquals("a$", ts.next().termText());
-    assertEquals("caaaaaaaaa$", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }
 
 }
TestPhoneticFilter.java
@@ -81,8 +81,8 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
         new IterTokenStream(stream.iterator()), enc, "text", inject );
 
     for( Token t : output ) {
-      Token got = filter.next();
-      assertEquals( t.termText(), got.termText());
+      Token got = filter.next(t);
+      assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
     }
     assertNull( filter.next() );  // no more tokens
   }
TestTrimFilter.java
@@ -35,11 +35,16 @@ public class TestTrimFilter extends BaseTokenTestCase {
                 new Token("cCc",11,15),
                 new Token(" ",16,20)), false );
 
-    assertEquals("a", ts.next().termText());
-    assertEquals("b", ts.next().termText());
-    assertEquals("cCc", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
 
     ts = new TrimFilter( new IterTokenStream(
            new Token(" a", 0,2),
ArraysUtilsTest.java (new file)
@@ -0,0 +1,48 @@
+package org.apache.solr.util;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+public class ArraysUtilsTest extends TestCase {
+
+  public ArraysUtilsTest(String s) {
+    super(s);
+  }
+
+  protected void setUp() {
+  }
+
+  protected void tearDown() {
+  }
+
+  public void test() {
+    String left = "this is equal";
+    String right = left;
+    char[] leftChars = left.toCharArray();
+    char[] rightChars = right.toCharArray();
+    assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length()));
+
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length()));
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length()));
+
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length()));
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length()));
+  }
+}