SOLR-330: Converted Solr tokenstreams to use Lucene's char[] capabilities

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@643465 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-04-01 16:10:19 +00:00
parent c124044825
commit e2c2a8d240
20 changed files with 469 additions and 225 deletions

View File

@@ -867,6 +867,7 @@ Optimizations
a single token per document (not multiValued & not tokenized) by using the
Lucene FieldCache entry for that field to tally term counts. The first request
utilizing the FieldCache will take longer than subsequent ones.
7. Converted TokenStreams to use Lucene's new char array based capabilities. (SOLR-330, gsingers)
Bug Fixes
1. Fixed delete-by-id for field types whose indexed form is different
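
The practical meaning of item 7: before this commit, filters pulled a java.lang.String out of every token via Token.termText(); after it, they read and write the token's underlying char[] buffer. A minimal sketch of the two styles, assuming only the Lucene 2.3-era Token methods that appear throughout the diffs below:

import org.apache.lucene.analysis.Token;

class TokenApiSketch {
  // Old style: termText() materializes a String for every token inspected.
  static boolean startsWithOld(Token t, char c) {
    String text = t.termText();
    return text.length() > 0 && text.charAt(0) == c;
  }
  // New style: read the shared buffer in place; no String is created.
  static boolean startsWithNew(Token t, char c) {
    return t.termLength() > 0 && t.termBuffer()[0] == c;
  }
}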

View File

@@ -55,7 +55,7 @@ import java.util.LinkedList;
* @version $Id$
*/
public abstract class BufferedTokenStream extends TokenStream {
// in the futute, might be faster if we implemented as an array based CircularQueue
// in the future, might be faster if we implemented as an array based CircularQueue
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
private final TokenStream input;

View File

@@ -17,97 +17,92 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import java.util.List;
import java.util.Set;
import java.io.IOException;
import java.util.List;
/**
* @version $Id$
*/
public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public void inform(ResourceLoader loader) {
String wordFile = args.get("protected");
String wordFile = args.get(PROTECTED_TOKENS);
if (wordFile != null) {
try {
List<String> wlist = loader.getLines(wordFile);
protectedWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]));
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private Set protectedWords = null;
private CharArraySet protectedWords = null;
public EnglishPorterFilter create(TokenStream input) {
return new EnglishPorterFilter(input,protectedWords);
return new EnglishPorterFilter(input, protectedWords);
}
}
/** English Porter2 filter that doesn't use reflection to
/* adapt lucene to the snowball stemmer code.
/**
* English Porter2 filter that doesn't use reflection to
* adapt lucene to the snowball stemmer code.
*/
class EnglishPorterFilter extends TokenFilter {
private final Set protWords;
private final CharArraySet protWords;
private net.sf.snowball.ext.EnglishStemmer stemmer;
public EnglishPorterFilter(TokenStream source, Set protWords) {
public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
super(source);
this.protWords=protWords;
this.protWords = protWords;
stemmer = new net.sf.snowball.ext.EnglishStemmer();
}
/** the original code from lucene sandbox
public final Token next() throws IOException {
Token token = input.next();
if (token == null)
return null;
stemmer.setCurrent(token.termText());
try {
stemMethod.invoke(stemmer, EMPTY_ARGS);
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
return new Token(stemmer.getCurrent(),
token.startOffset(), token.endOffset(), token.type());
}
**/
/**
* the original code from lucene sandbox
* public final Token next() throws IOException {
* Token token = input.next();
* if (token == null)
* return null;
* stemmer.setCurrent(token.termText());
* try {
* stemMethod.invoke(stemmer, EMPTY_ARGS);
* } catch (Exception e) {
* throw new RuntimeException(e.toString());
* }
* return new Token(stemmer.getCurrent(),
* token.startOffset(), token.endOffset(), token.type());
* }
*/
@Override
public Token next() throws IOException {
Token tok = input.next();
if (tok==null) return null;
String tokstr = tok.termText();
// if protected, don't stem. use this to avoid stemming collisions.
if (protWords != null && protWords.contains(tokstr)) {
return tok;
public Token next(Token token) throws IOException {
Token result = input.next(token);
if (result != null) {
char[] termBuffer = result.termBuffer();
int len = result.termLength();
// if protected, don't stem. use this to avoid stemming collisions.
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return result;
}
stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
stemmer.stem();
String newstr = stemmer.getCurrent();
result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
}
stemmer.setCurrent(tokstr);
stemmer.stem();
String newstr = stemmer.getCurrent();
if (tokstr.equals(newstr)) {
return tok;
} else {
// TODO: it would be nice if I could just set termText directly like
// lucene packages can.
Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type());
newtok.setPositionIncrement(tok.getPositionIncrement());
return newtok;
}
return result;
}
}
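
The gain from swapping Set for CharArraySet is the contains(char[], int, int) overload: the protected-words check now tests a slice of the term buffer directly instead of building a String first. A sketch of just that lookup, using the word list from the factory test later in this commit; the surrounding class is hypothetical glue:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;

class ProtectedWordsSketch {
  private final CharArraySet protWords =
      new CharArraySet(Arrays.asList("banks", "fledgling"), false);

  boolean isProtected(Token t) {
    // Compares against the buffer slice -- no new String(...) per token.
    return protWords.contains(t.termBuffer(), 0, t.termLength());
  }
}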

View File

@@ -28,25 +28,26 @@ import org.apache.lucene.analysis.*;
* This filter should be used on indexing time only.
* Example field definition in schema.xml:
* <pre>
* <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
* <analyzer type="index">
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
* <filter class="solr.HyphenatedWordsFilterFactory"/>
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
* <filter class="solr.LowerCaseFilterFactory"/>
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
* </analyzer>
* <analyzer type="query">
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
* <filter class="solr.StopFilterFactory" ignoreCase="true"/>
* <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
* <filter class="solr.LowerCaseFilterFactory"/>
* <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
* </analyzer>
* </fieldtype>
* &lt;fieldtype name="text" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer type="index"&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/&gt;
* &lt;filter class="solr.StopFilterFactory" ignoreCase="true"/&gt;
* &lt;filter class="solr.HyphenatedWordsFilterFactory"/&gt;
* &lt;filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;analyzer type="query"&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/&gt;
* &lt;filter class="solr.StopFilterFactory" ignoreCase="true"/&gt;
* &lt;filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldtype&gt;
* </pre>
*
*/
public final class HyphenatedWordsFilter extends TokenFilter {
@@ -55,16 +56,18 @@ public final class HyphenatedWordsFilter extends TokenFilter {
super(in);
}
/**
/**
* @inheritDoc
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public final Token next() throws IOException {
StringBuffer termText = new StringBuffer(25);
public final Token next(Token in) throws IOException {
StringBuilder termText = new StringBuilder(25);
int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
Token lastToken = null;
for (Token token = input.next(); token != null; token = input.next()) {
termText.append(token.termText());
for (Token token = input.next(in); token != null; token = input.next()) {
termText.append(token.termBuffer(), 0, token.termLength());
//current token ends with hyphen -> grab the next token and glue them together
if (termText.charAt(termText.length() - 1) == '-') {
wordsMerged++;
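
Note the switch from termText() to StringBuilder.append(char[], int, int) above: fragments are glued together without an intermediate String per token. The merge step, distilled into a hypothetical helper (only the append and setTermBuffer calls are taken from the filter):

import org.apache.lucene.analysis.Token;

class GlueSketch {
  // Joins "ecologi-" + "cal" into "ecological" on the first token.
  static void glue(Token first, Token second) {
    StringBuilder sb = new StringBuilder(25);
    sb.append(first.termBuffer(), 0, first.termLength() - 1); // drop trailing '-'
    sb.append(second.termBuffer(), 0, second.termLength());
    char[] merged = new char[sb.length()];
    sb.getChars(0, sb.length(), merged, 0);
    first.setTermBuffer(merged, 0, merged.length);
  }
}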

View File

@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import java.io.IOException;
import java.util.Set;
@@ -32,23 +33,18 @@ import java.util.Set;
* @since solr 1.3
*/
public final class KeepWordFilter extends TokenFilter {
final Set<String> words;
final boolean ignoreCase;
final CharArraySet words;
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
super(in);
this.words=words;
this.ignoreCase=ignoreCase;
this.words = new CharArraySet(words, ignoreCase);
}
@Override
public final Token next() throws IOException {
for (Token token=input.next(); token!=null; token=input.next()) {
String txt = ignoreCase
? token.termText().toLowerCase()
: token.termText();
if( words.contains( txt ) ) {
public final Token next(Token in) throws IOException {
for (Token token=input.next(in); token!=null; token=input.next()) {
if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
return token;
}
}
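
Case handling moved out of the hot loop: the constructor folds ignoreCase into the CharArraySet once, so next() no longer lower-cases each token's text. A usage sketch with a hypothetical word list and input (WhitespaceTokenizer is the same class this commit's tests use):

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.KeepWordFilter;

class KeepWordSketch {
  static TokenStream build() {
    Set<String> keep = new HashSet<String>(Arrays.asList("lucene", "solr"));
    // ignoreCase=true: "Solr" matches "solr" without copying or lower-casing.
    return new KeepWordFilter(
        new WhitespaceTokenizer(new StringReader("Solr wraps Lucene")),
        keep, true);
  }
}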

View File

@@ -36,8 +36,8 @@ public final class LengthFilter extends TokenFilter {
//System.out.println("min="+min+" max="+max);
}
public final Token next() throws IOException {
for (Token token=input.next(); token!=null; token=input.next()) {
public final Token next(Token in) throws IOException {
for (Token token=input.next(in); token!=null; token=input.next(in)) {
final int len = token.endOffset() - token.startOffset();
if (len<min || len>max) continue;
return token;

View File

@@ -27,12 +27,14 @@ import java.util.Map;
*/
public class LengthFilterFactory extends BaseTokenFilterFactory {
int min,max;
public static final String MIN_KEY = "min";
public static final String MAX_KEY = "max";
@Override
public void init(Map<String, String> args) {
super.init(args);
min=Integer.parseInt(args.get("min"));
max=Integer.parseInt(args.get("max"));
min=Integer.parseInt(args.get(MIN_KEY));
max=Integer.parseInt(args.get(MAX_KEY));
}
public LengthFilter create(TokenStream input) {
return new LengthFilter(input,min,max);
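
Promoting "min" and "max" to MIN_KEY/MAX_KEY lets callers and tests share the argument names. Initialization then looks like the LengthFilterTest added later in this commit:

import java.util.HashMap;
import java.util.Map;
import org.apache.solr.analysis.LengthFilterFactory;

class LengthFactorySketch {
  static LengthFilterFactory build() {
    LengthFilterFactory factory = new LengthFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put(LengthFilterFactory.MIN_KEY, "4");  // keep terms of length 4..10
    args.put(LengthFilterFactory.MAX_KEY, "10");
    factory.init(args);
    return factory;
  }
}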

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.IOException;
import java.nio.CharBuffer;
/**
* A TokenFilter which applies a Pattern to each token in the stream,
@@ -64,12 +65,12 @@ public final class PatternReplaceFilter extends TokenFilter {
this.all=all;
}
public final Token next() throws IOException {
Token t = input.next();
public final Token next(Token in) throws IOException {
Token t = input.next(in);
if (t == null)
return null;
Matcher m = p.matcher(t.termText());
CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
Matcher m = p.matcher(text);
if (all) {
t.setTermText(m.replaceAll(replacement));
} else {
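
CharBuffer.wrap is the key move here: java.nio.CharBuffer implements CharSequence, so the regex engine scans the term buffer in place; only the replacement itself still produces a String. A self-contained sketch of the idea (pattern and input are arbitrary):

import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class WrapSketch {
  static String replaceAll(char[] buf, int len) {
    CharSequence text = CharBuffer.wrap(buf, 0, len); // a view, not a copy
    Matcher m = Pattern.compile("a*b").matcher(text);
    return m.replaceAll("-");
  }
}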

View File

@@ -46,29 +46,27 @@ public class PhoneticFilter extends TokenFilter
}
@Override
public final Token next() throws IOException {
public final Token next(Token in) throws IOException {
if( save != null ) {
Token temp = save;
save = null;
return temp;
}
Token t = input.next();
Token t = input.next(in);
if( t != null ) {
String value = t.termText();
String value = new String(t.termBuffer(), 0, t.termLength());
try {
value = encoder.encode(t.termText()).toString();
value = encoder.encode(value).toString();
}
catch (Exception ignored) {} // just use the direct text
Token m = new Token(value, t.startOffset(), t.endOffset(), name );
//Token m = new Token(value, t.startOffset(), t.endOffset(), name );
if( inject ) {
m.setPositionIncrement(0);
save = m;
}
else {
// replace the token rather then add it too the stream
return m;
save = (Token) t.clone();
save.setPositionIncrement(0);
save.setTermBuffer(value.toCharArray(), 0, value.length());
} else {
t.setTermBuffer(value.toCharArray(), 0, value.length());
}
}
return t;
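
The rewrite collapses both modes onto one token: in replace mode the encoded value overwrites the current token's buffer; in inject mode a clone carrying the encoding is stacked at the same position (positionIncrement 0) and saved for the next call. The inject branch, as a hypothetical helper:

import org.apache.lucene.analysis.Token;

class InjectSketch {
  static Token makeInjected(Token t, String encoded) {
    Token phonetic = (Token) t.clone();
    phonetic.setPositionIncrement(0); // stack on the original term's position
    phonetic.setTermBuffer(encoded.toCharArray(), 0, encoded.length());
    return phonetic; // the filter emits this right after t
  }
}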

View File

@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.util.ArraysUtils;
import java.io.IOException;
@@ -30,23 +31,27 @@ public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
protected Token process(Token t) throws IOException {
Token tok = read();
OUT: while (tok != null && tok.getPositionIncrement()==0) {
while (tok != null && tok.getPositionIncrement()==0) {
if (null != t) {
write(t);
t = null;
}
boolean dup=false;
IN: for (Token outTok : output()) {
if (outTok.termText().equals(tok.termText())) {
for (Token outTok : output()) {
int tokLen = tok.termLength();
if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
dup=true;
break IN;
//continue;;
}
}
if (!dup)
if (!dup){
write(tok);
}
tok = read();
}
if (tok != null) pushBack(tok);
if (tok != null) {
pushBack(tok);
}
return t;
}
}

View File

@@ -17,9 +17,9 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
@@ -29,50 +29,54 @@ import java.io.IOException;
* @version $Id:$
*/
public final class TrimFilter extends TokenFilter {
final boolean updateOffsets;
public TrimFilter(TokenStream in, boolean updateOffsets ) {
public TrimFilter(TokenStream in, boolean updateOffsets) {
super(in);
this.updateOffsets = updateOffsets;
}
@Override
public final Token next() throws IOException {
Token t = input.next();
if (null == t || null == t.termText())
public final Token next(Token in) throws IOException {
Token t = input.next(in);
if (null == t || null == t.termBuffer() || t.termLength() == 0){
return t;
}
char[] termBuffer = t.termBuffer();
int len = t.termLength();
int start = 0;
int end = 0;
int endOff = 0;
if( updateOffsets ) {
String txt = t.termText();
int start = 0;
int end = txt.length();
int endOff = 0;
// eat the first characters
while ((start < end) && (txt.charAt(start) <= ' ')) {
start++;
}
// eat the end characters
while ((start < end) && (txt.charAt(end-1) <= ' ')) {
end--;
endOff++;
}
if( start > 0 || end < txt.length() ) {
int incr = t.getPositionIncrement();
t = new Token( t.termText().substring( start, end ),
t.startOffset()+start,
t.endOffset()-endOff,
t.type() );
t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
}
// eat the first characters
//QUESTION: Should we use Character.isWhitespace() instead?
for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
}
else {
t.setTermText( t.termText().trim() );
// eat the end characters
for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
endOff++;
}
if (start > 0 || end < len) {
if (start < end) {
t.setTermBuffer(t.termBuffer(), start, (end - start));
} else {
t.setTermLength(0);
}
if (updateOffsets) {
t.setStartOffset(t.startOffset() + start);
if (start < end) {
t.setEndOffset(t.endOffset() - endOff);
} // else end is less than start: the term length is 0, so there is no need to adjust the end offset
}
/*t = new Token( t.termText().substring( start, end ),
t.startOffset()+start,
t.endOffset()-endOff,
t.type() );*/
}
return t;
}
}
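
The new TrimFilter trims in place: scan the buffer from both ends, then hand the surviving slice back through setTermBuffer (which copies it into the token's own storage), touching offsets only when updateOffsets is set. The core scan, distilled:

import org.apache.lucene.analysis.Token;

class TrimSketch {
  static void trim(Token t) {
    char[] buf = t.termBuffer();
    int len = t.termLength();
    int start = 0;
    while (start < len && buf[start] <= ' ') start++;    // leading whitespace
    int end = len;
    while (end > start && buf[end - 1] <= ' ') end--;    // trailing whitespace
    if (start > 0 || end < len) {
      t.setTermBuffer(buf, start, end - start);          // shift; no new Token
    }
  }
}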

View File

@@ -192,7 +192,7 @@ final class WordDelimiterFilter extends TokenFilter {
// use the type of the first char as the type
// of the token.
private int tokType(Token t) {
return charType(t.termText().charAt(0));
return charType(t.termBuffer()[0]);
}
// There isn't really an efficient queue class, so we will
@@ -207,23 +207,22 @@ final class WordDelimiterFilter extends TokenFilter {
private Token newTok(Token orig, int start, int end) {
int startOff = orig.startOffset();
int endOff = orig.endOffset();
String origStr = orig.termText();
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
if (origStr.length() == endOff-startOff) {
if (orig.termLength() == endOff-startOff) {
endOff = startOff + end;
startOff += start;
}
return new Token(orig.termText().substring(start,end),
startOff,
Token newTok = new Token(startOff,
endOff,
orig.type());
newTok.setTermBuffer(orig.termBuffer(), start, (end - start));
return newTok;
}
public final Token next() throws IOException {
public final Token next(Token in) throws IOException {
// check the queue first
if (queuePos<queue.size()) {
@@ -248,25 +247,25 @@ final class WordDelimiterFilter extends TokenFilter {
Token t = input.next();
if (t == null) return null;
String s = t.termText();
char [] termBuffer = t.termBuffer();
int len = t.termLength();
int start=0;
int end=s.length();
if (end==0) continue;
if (len ==0) continue;
origPosIncrement = t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
// makes code slightly more difficult, but faster.
int ch=s.charAt(start);
int ch=termBuffer[start];
int type=charType(ch);
int numWords=0;
while (start<end) {
while (start< len) {
// first eat delimiters at the start of this subword
while ((type & SUBWORD_DELIM)!=0 && ++start<end) {
ch=s.charAt(start);
while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
ch=termBuffer[start];
type=charType(ch);
}
@@ -278,23 +277,23 @@ final class WordDelimiterFilter extends TokenFilter {
int lastType=type; // type of the previously read char
while (pos<end) {
while (pos< len) {
if (type!=lastType) {
// check and remove "'s" from the end of a token.
// the pattern to check for is
// ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
if ((lastType & ALPHA)!=0) {
if (ch=='\'' && pos+1<end
&& (s.charAt(pos+1)=='s' || s.charAt(pos+1)=='S'))
if (ch=='\'' && pos+1< len
&& (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
{
int subWordEnd=pos;
if (pos+2>=end) {
if (pos+2>= len) {
// end of string detected after "'s"
pos+=2;
} else {
// make sure that a delimiter follows "'s"
int ch2 = s.charAt(pos+2);
int ch2 = termBuffer[pos+2];
int type2 = charType(ch2);
if ((type2 & SUBWORD_DELIM)!=0) {
// if delimiter, move position pointer
@@ -340,7 +339,7 @@ final class WordDelimiterFilter extends TokenFilter {
}
}
if (++pos >= end) {
if (++pos >= len) {
if (start==0) {
// the subword is the whole original token, so
// return it unchanged.
@@ -362,7 +361,7 @@ final class WordDelimiterFilter extends TokenFilter {
}
lastType = type;
ch = s.charAt(pos);
ch = termBuffer[pos];
type = charType(ch);
}
@@ -482,7 +481,7 @@ final class WordDelimiterFilter extends TokenFilter {
tok = lst.get(i);
if (catenateSubwords) {
if (i==start) firstTok=tok;
sb.append(tok.termText());
sb.append(tok.termBuffer(), 0, tok.termLength());
}
if (generateSubwords) {
queue.add(tok);
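
newTok above is the pattern for carving a sub-token out of a parent token: construct with offsets and type only, then copy just the wanted slice via setTermBuffer, rather than substring-ing a String. The same move in isolation (slice bounds are hypothetical; the offset adjustment is simplified from the filter's synonym check):

import org.apache.lucene.analysis.Token;

class SubTokenSketch {
  static Token slice(Token orig, int start, int end) {
    Token sub = new Token(orig.startOffset() + start,
                          orig.startOffset() + end,
                          orig.type());
    sub.setTermBuffer(orig.termBuffer(), start, end - start); // copies the slice
    return sub;
  }
}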

View File

@@ -0,0 +1,35 @@
package org.apache.solr.util;
/**
 * Utility methods for char arrays; provided because
 * java.util.Arrays#equals does not support comparing array slices at offsets.
 **/
public class ArraysUtils {
/**
* See if two array slices are the same.
*
* @param left The left array to compare
* @param offsetLeft The offset into the array. Must be positive
* @param right The right array to compare
* @param offsetRight the offset into the right array. Must be positive
* @param length The length of the section of the array to compare
* @return true if the two arrays, starting at their respective offsets, are equal
*
* @see java.util.Arrays#equals(char[], char[])
*/
public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
for (int i = 0; i < length; i++) {
if (left[offsetLeft + i] != right[offsetRight + i]) {
return false;
}
}
return true;
}
return false;
}
}
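
Context for why this helper exists: java.util.Arrays only gained a slice-aware equals (Arrays.equals(char[], int, int, char[], int, int)) in Java 9, long after this commit. Usage on two buffers whose common content sits at different offsets:

class ArraysUtilsDemo {
  public static void main(String[] args) {
    char[] left = "foobar".toCharArray();
    char[] right = "xxfoobar".toCharArray();
    // Compare all of left against the slice of right starting at offset 2.
    System.out.println(
        org.apache.solr.util.ArraysUtils.equals(left, 0, right, 2, 6)); // true
  }
}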

View File

@@ -0,0 +1,96 @@
package org.apache.solr.analysis;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import net.sf.snowball.ext.EnglishStemmer;
import org.apache.solr.common.ResourceLoader;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;
public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
public void test() throws IOException {
EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
for (int i = 0; i < test.length; i++) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
}
public void testProtected() throws Exception {
EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
} else {
gold.append(test[i]).append(' ');
}
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
}
class LinesMockSolrResourceLoader implements ResourceLoader {
List<String> lines;
LinesMockSolrResourceLoader(List<String> lines) {
this.lines = lines;
}
public List<String> getLines(String resource) throws IOException {
return lines;
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return null;
}
}
}

View File

@@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class LengthFilterTest extends BaseTokenTestCase {
public void test() throws IOException {
LengthFilterFactory factory = new LengthFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
factory.init(args);
String[] test = {"foo", "foobar", "super-duper-trooper"};
String gold = "foobar";
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString(), out);
}
}

View File

@@ -27,8 +27,8 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
*/
public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
ts = new HyphenatedWordsFilter(ts);

View File

@@ -17,76 +17,96 @@
package org.apache.solr.analysis;
import java.io.StringReader;
import java.util.regex.Pattern;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.StringReader;
import java.util.regex.Pattern;
/**
* @version $Id:$
*/
public class TestPatternReplaceFilter extends AnalysisTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", true);
assertEquals("-foo-foo-foo-", ts.next().termText());
assertEquals("-", ts.next().termText());
assertEquals("c-", ts.next().termText());
assertNull(ts.next());
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", true);
Token token = ts.next();
assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
}
public void testReplaceFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", false);
assertEquals("-fooaabfooabfoob", ts.next().termText());
assertEquals("-", ts.next().termText());
assertEquals("c-", ts.next().termText());
assertNull(ts.next());
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", false);
Token token = ts.next();
assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
}
public void testStripFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, false);
assertEquals("fooaabfooabfoob", ts.next().termText());
assertEquals("", ts.next().termText());
assertEquals("c", ts.next().termText());
assertNull(ts.next());
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, false);
Token token = ts.next();
assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
}
public void testStripAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, true);
assertEquals("foofoofoo", ts.next().termText());
assertEquals("", ts.next().termText());
assertEquals("c", ts.next().termText());
assertNull(ts.next());
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, true);
Token token = ts.next();
assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
}
public void testReplaceAllWithBackRef() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("(a*)b"),
"$1\\$", true);
assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
assertEquals("a$", ts.next().termText());
assertEquals("caaaaaaaaa$", ts.next().termText());
assertNull(ts.next());
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("(a*)b"),
"$1\\$", true);
Token token = ts.next();
assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
}
}

View File

@@ -81,8 +81,8 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
new IterTokenStream(stream.iterator()), enc, "text", inject );
for( Token t : output ) {
Token got = filter.next();
assertEquals( t.termText(), got.termText());
Token got = filter.next(t);
assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
}
assertNull( filter.next() ); // no more tokens
}

View File

@@ -35,11 +35,16 @@ public class TestTrimFilter extends BaseTokenTestCase {
new Token("cCc",11,15),
new Token(" ",16,20)), false );
assertEquals("a", ts.next().termText());
assertEquals("b", ts.next().termText());
assertEquals("cCc", ts.next().termText());
assertEquals("", ts.next().termText());
assertNull(ts.next());
Token token = ts.next();
assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
ts = new TrimFilter( new IterTokenStream(
new Token(" a", 0,2),

View File

@@ -0,0 +1,48 @@
package org.apache.solr.util;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
public class ArraysUtilsTest extends TestCase {
public ArraysUtilsTest(String s) {
super(s);
}
protected void setUp() {
}
protected void tearDown() {
}
public void test() {
String left = "this is equal";
String right = left;
char[] leftChars = left.toCharArray();
char[] rightChars = right.toCharArray();
assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length()));
assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length()));
assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length()));
assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length()));
assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length()));
}
}