SOLR-330: Converted Solr tokenstreams to use Lucene's char[] capabilities

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@643465 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-04-01 16:10:19 +00:00
parent c124044825
commit e2c2a8d240
20 changed files with 469 additions and 225 deletions

View File

@@ -867,6 +867,7 @@ Optimizations
    a single token per document (not multiValued & not tokenized) by using the
    Lucene FieldCache entry for that field to tally term counts.  The first request
    utilizing the FieldCache will take longer than subsequent ones.
+ 7. Converted TokenStreams to use Lucene's new char array based capabilities. (SOLR-330, gsingers)

 Bug Fixes
  1. Fixed delete-by-id for field types who's indexed form is different
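The change, in a nutshell: filters move from the String-returning Token.termText() API to Lucene 2.3's reusable-token, char[]-based API. A minimal sketch of the new pattern (illustrative only, not code from this commit; the filter class and its behavior are made up):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;

// Illustrative upper-casing filter showing the char[] pattern used throughout this diff.
class UpperCaseSketchFilter extends TokenFilter {
  protected UpperCaseSketchFilter(TokenStream in) { super(in); }

  @Override
  public Token next(Token in) throws IOException {
    Token t = input.next(in);      // reuse the caller's Token when possible
    if (t == null) return null;
    char[] buf = t.termBuffer();   // mutate the buffer in place...
    int len = t.termLength();      // ...only the first termLength() chars are live
    for (int i = 0; i < len; i++) {
      buf[i] = Character.toUpperCase(buf[i]);
    }
    return t;                      // no new String, no new Token
  }
}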

View File

@@ -55,7 +55,7 @@ import java.util.LinkedList;
  * @version $Id$
  */
 public abstract class BufferedTokenStream extends TokenStream {
-  // in the futute, might be faster if we implemented as an array based CircularQueue
+  // in the future, might be faster if we implemented as an array based CircularQueue
   private final LinkedList<Token> inQueue = new LinkedList<Token>();
   private final LinkedList<Token> outQueue = new LinkedList<Token>();
   private final TokenStream input;
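An aside on the comment above: an array-based circular queue would look roughly like this (purely illustrative, not part of this commit; class name hypothetical):

// Illustrative sketch: a circular queue of Tokens backed by an array that
// grows by doubling, avoiding LinkedList's per-node allocations.
final class TokenCircularQueue {
  private Token[] buf = new Token[8];
  private int head = 0, size = 0;

  void add(Token t) {
    if (size == buf.length) grow();
    buf[(head + size) % buf.length] = t;
    size++;
  }

  Token poll() {
    if (size == 0) return null;
    Token t = buf[head];
    buf[head] = null;               // let GC reclaim the slot
    head = (head + 1) % buf.length;
    size--;
    return t;
  }

  private void grow() {
    Token[] next = new Token[buf.length * 2];
    for (int i = 0; i < size; i++) next[i] = buf[(head + i) % buf.length];
    buf = next;
    head = 0;
  }
}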

View File

@@ -17,97 +17,92 @@
 package org.apache.solr.analysis;

+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
-import java.util.List;
-import java.util.Set;

 import java.io.IOException;
+import java.util.List;

 /**
  * @version $Id$
  */
 public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";

   public void inform(ResourceLoader loader) {
-    String wordFile = args.get("protected");
+    String wordFile = args.get(PROTECTED_TOKENS);
     if (wordFile != null) {
       try {
         List<String> wlist = loader.getLines(wordFile);
-        protectedWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0])); //This cast is safe in Lucene
+        protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }
   }

-  private Set protectedWords = null;
+  private CharArraySet protectedWords = null;

   public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input,protectedWords);
+    return new EnglishPorterFilter(input, protectedWords);
   }

 }

-/** English Porter2 filter that doesn't use reflection to
-/*  adapt lucene to the snowball stemmer code.
+/**
+ * English Porter2 filter that doesn't use reflection to
+ * adapt lucene to the snowball stemmer code.
  */
 class EnglishPorterFilter extends TokenFilter {
-  private final Set protWords;
+  private final CharArraySet protWords;
   private net.sf.snowball.ext.EnglishStemmer stemmer;

-  public EnglishPorterFilter(TokenStream source, Set protWords) {
+  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
     super(source);
-    this.protWords=protWords;
+    this.protWords = protWords;
     stemmer = new net.sf.snowball.ext.EnglishStemmer();
   }

-  /** the original code from lucene sandbox
-  public final Token next() throws IOException {
-    Token token = input.next();
-    if (token == null)
-      return null;
-    stemmer.setCurrent(token.termText());
-    try {
-      stemMethod.invoke(stemmer, EMPTY_ARGS);
-    } catch (Exception e) {
-      throw new RuntimeException(e.toString());
-    }
-    return new Token(stemmer.getCurrent(),
-                     token.startOffset(), token.endOffset(), token.type());
-  }
-  **/
+  /**
+   * the original code from lucene sandbox
+   * public final Token next() throws IOException {
+   *   Token token = input.next();
+   *   if (token == null)
+   *     return null;
+   *   stemmer.setCurrent(token.termText());
+   *   try {
+   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
+   *   } catch (Exception e) {
+   *     throw new RuntimeException(e.toString());
+   *   }
+   *   return new Token(stemmer.getCurrent(),
+   *       token.startOffset(), token.endOffset(), token.type());
+   * }
+   */

   @Override
-  public Token next() throws IOException {
-    Token tok = input.next();
-    if (tok==null) return null;
-    String tokstr = tok.termText();
-
-    // if protected, don't stem.  use this to avoid stemming collisions.
-    if (protWords != null && protWords.contains(tokstr)) {
-      return tok;
-    }
-
-    stemmer.setCurrent(tokstr);
-    stemmer.stem();
-    String newstr = stemmer.getCurrent();
-    if (tokstr.equals(newstr)) {
-      return tok;
-    } else {
-      // TODO: it would be nice if I could just set termText directly like
-      // lucene packages can.
-      Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type());
-      newtok.setPositionIncrement(tok.getPositionIncrement());
-      return newtok;
-    }
+  public Token next(Token token) throws IOException {
+    Token result = input.next(token);
+    if (result != null) {
+      char[] termBuffer = result.termBuffer();
+      int len = result.termLength();
+      // if protected, don't stem.  use this to avoid stemming collisions.
+      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        return result;
+      }
+      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+      stemmer.stem();
+      String newstr = stemmer.getCurrent();
+      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+    }
+    return result;
   }
 }
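Note the key win above: CharArraySet.contains(char[], int, int) tests membership directly against the term buffer, so no String is allocated per token. A standalone illustration (assuming the Lucene 2.3 CharArraySet API used in this diff):

import org.apache.lucene.analysis.CharArraySet;
import java.util.Arrays;

public class CharArraySetDemo {
  public static void main(String[] args) {
    // false = case-sensitive, matching the factory above
    CharArraySet protectedWords =
        new CharArraySet(Arrays.asList("banks", "fledgling"), false);

    char[] termBuffer = "banks".toCharArray();
    // contains(char[], offset, length) avoids new String(termBuffer, ...)
    System.out.println(protectedWords.contains(termBuffer, 0, termBuffer.length)); // true
  }
}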

View File

@@ -28,25 +28,26 @@ import org.apache.lucene.analysis.*;
  * This filter should be used on indexing time only.
  * Example field definition in schema.xml:
  * <pre>
- * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer type="index">
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- *     <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *     <filter class="solr.HyphenatedWordsFilterFactory"/>
- *     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *   </analyzer>
- *   <analyzer type="query">
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- *     <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *   </analyzer>
- * </fieldtype>
+ * &lt;fieldtype name="text" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer type="index"&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/&gt;
+ *     &lt;filter class="solr.StopFilterFactory" ignoreCase="true"/&gt;
+ *     &lt;filter class="solr.HyphenatedWordsFilterFactory"/&gt;
+ *     &lt;filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ *   &lt;analyzer type="query"&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/&gt;
+ *     &lt;filter class="solr.StopFilterFactory" ignoreCase="true"/&gt;
+ *     &lt;filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldtype&gt;
+ * </pre>
  *
  */
 public final class HyphenatedWordsFilter extends TokenFilter {
@@ -55,16 +56,18 @@ public final class HyphenatedWordsFilter extends TokenFilter {
     super(in);
   }

   /**
    * @inheritDoc
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public final Token next() throws IOException {
-    StringBuffer termText = new StringBuffer(25);
+  public final Token next(Token in) throws IOException {
+    StringBuilder termText = new StringBuilder(25);
     int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
     Token lastToken = null;
-    for (Token token = input.next(); token != null; token = input.next()) {
-      termText.append(token.termText());
+    for (Token token = input.next(in); token != null; token = input.next()) {
+      termText.append(token.termBuffer(), 0, token.termLength());
       //current token ends with hyphen -> grab the next token and glue them together
       if (termText.charAt(termText.length() - 1) == '-') {
         wordsMerged++;

View File

@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;

 import java.io.IOException;
 import java.util.Set;
@@ -32,23 +33,18 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final Set<String> words;
-  final boolean ignoreCase;
+  final CharArraySet words;

   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
-    this.words=words;
-    this.ignoreCase=ignoreCase;
+    this.words = new CharArraySet(words, ignoreCase);
   }

   @Override
-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
-      String txt = ignoreCase
-        ? token.termText().toLowerCase()
-        : token.termText();
-      if( words.contains( txt ) ) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next()) {
+      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
         return token;
       }
     }
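Passing ignoreCase=true to CharArraySet moves case folding into the hash lookup itself, replacing the old per-token termText().toLowerCase() allocation. A quick standalone illustration (not from this commit):

import org.apache.lucene.analysis.CharArraySet;
import java.util.Arrays;

public class KeepWordDemo {
  public static void main(String[] args) {
    CharArraySet keep = new CharArraySet(Arrays.asList("Solr"), true); // ignoreCase=true

    char[] term = "SOLR".toCharArray();
    // matches without ever building a lowercased String copy of the token
    System.out.println(keep.contains(term, 0, term.length)); // true
  }
}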

View File

@@ -36,8 +36,8 @@ public final class LengthFilter extends TokenFilter {
     //System.out.println("min="+min+" max="+max);
   }

-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next(in)) {
       final int len = token.endOffset() - token.startOffset();
       if (len<min || len>max) continue;
       return token;

View File

@@ -27,12 +27,14 @@ import java.util.Map;
  */
 public class LengthFilterFactory extends BaseTokenFilterFactory {
   int min,max;
+  public static final String MIN_KEY = "min";
+  public static final String MAX_KEY = "max";

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    min=Integer.parseInt(args.get("min"));
-    max=Integer.parseInt(args.get("max"));
+    min=Integer.parseInt(args.get(MIN_KEY));
+    max=Integer.parseInt(args.get(MAX_KEY));
   }

   public LengthFilter create(TokenStream input) {
     return new LengthFilter(input,min,max);

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.IOException;
+import java.nio.CharBuffer;

 /**
  * A TokenFilter which applies a Pattern to each token in the stream,
@@ -64,12 +65,12 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.all=all;
   }

-  public final Token next() throws IOException {
-    Token t = input.next();
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
     if (t == null)
       return null;
-    Matcher m = p.matcher(t.termText());
+    CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
+    Matcher m = p.matcher(text);
     if (all) {
       t.setTermText(m.replaceAll(replacement));
     } else {
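CharBuffer.wrap is what lets Matcher read straight from the token's char[] without a String copy, since the regex engine only needs a CharSequence. Standalone illustration (plain JDK, no Solr classes):

import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CharBufferRegexDemo {
  public static void main(String[] args) {
    char[] termBuffer = "aabfooaabfooabfoob".toCharArray();
    int termLength = termBuffer.length;

    // wrap() creates a view over the array; no characters are copied
    CharSequence text = CharBuffer.wrap(termBuffer, 0, termLength);
    Matcher m = Pattern.compile("a*b").matcher(text);
    System.out.println(m.replaceAll("-")); // -foo-foo-foo-
  }
}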

View File

@@ -46,29 +46,27 @@ public class PhoneticFilter extends TokenFilter
   }

   @Override
-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {
     if( save != null ) {
       Token temp = save;
       save = null;
       return temp;
     }

-    Token t = input.next();
+    Token t = input.next(in);
     if( t != null ) {
-      String value = t.termText();
+      String value = new String(t.termBuffer(), 0, t.termLength());
       try {
-        value = encoder.encode(t.termText()).toString();
+        value = encoder.encode(value).toString();
       }
       catch (Exception ignored) {} // just use the direct text

-      //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
-      Token m = new Token(value, t.startOffset(), t.endOffset(), name );
       if( inject ) {
-        m.setPositionIncrement(0);
-        save = m;
-      }
-      else {
-        // replace the token rather then add it too the stream
-        return m;
+        save = (Token) t.clone();
+        save.setPositionIncrement(0);
+        save.setTermBuffer(value.toCharArray(), 0, value.length());
+      } else {
+        t.setTermBuffer(value.toCharArray(), 0, value.length());
       }
     }
     return t;
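For reference, with inject=true the encoded form is now a clone of the original token, stacked at the same position. A hypothetical trace (the encoder output "FN" below is made up for illustration):

// Hypothetical trace for input token "phone" with inject=true:
//
//   next() -> "phone"  (posInc=1, the original token, returned unmodified)
//   next() -> "FN"     (posInc=0, the saved clone, stacked on the same position)
//
// With inject=false the single token "phone" simply has its term buffer
// overwritten in place with the encoded form.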

View File

@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.util.ArraysUtils;

 import java.io.IOException;

@@ -30,23 +31,27 @@ public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
   public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}

   protected Token process(Token t) throws IOException {
     Token tok = read();
-    OUT: while (tok != null && tok.getPositionIncrement()==0) {
+    while (tok != null && tok.getPositionIncrement()==0) {
       if (null != t) {
         write(t);
         t = null;
       }
       boolean dup=false;
-      IN: for (Token outTok : output()) {
-        if (outTok.termText().equals(tok.termText())) {
+      for (Token outTok : output()) {
+        int tokLen = tok.termLength();
+        if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
           dup=true;
-          break IN;
+          //continue;;
         }
       }
-      if (!dup)
+      if (!dup){
         write(tok);
+      }
       tok = read();
     }
-    if (tok != null) pushBack(tok);
+    if (tok != null) {
+      pushBack(tok);
+    }
     return t;
   }
 }

View File

@@ -17,9 +17,9 @@
 package org.apache.solr.analysis;

+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;

 import java.io.IOException;

@@ -29,50 +29,54 @@ import java.io.IOException;
  * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {

   final boolean updateOffsets;

-  public TrimFilter(TokenStream in, boolean updateOffsets ) {
+  public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
     this.updateOffsets = updateOffsets;
   }

   @Override
-  public final Token next() throws IOException {
-    Token t = input.next();
-    if (null == t || null == t.termText())
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
+    if (null == t || null == t.termBuffer() || t.termLength() == 0){
       return t;
+    }
+    char[] termBuffer = t.termBuffer();
+    int len = t.termLength();
+    int start = 0;
+    int end = 0;
+    int endOff = 0;

-    if( updateOffsets ) {
-      String txt = t.termText();
-      int start = 0;
-      int end = txt.length();
-      int endOff = 0;
-
-      // eat the first characters
-      while ((start < end) && (txt.charAt(start) <= ' ')) {
-        start++;
-      }
-      // eat the end characters
-      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
-        end--;
-        endOff++;
-      }
-      if( start > 0 || end < txt.length() ) {
-        int incr = t.getPositionIncrement();
-        t = new Token( t.termText().substring( start, end ),
-                t.startOffset()+start,
-                t.endOffset()-endOff,
-                t.type() );
-        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
-      }
-    }
-    else {
-      t.setTermText( t.termText().trim() );
-    }
+    // eat the first characters
+    //QUESTION: Should we use Character.isWhitespace() instead?
+    for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
+    }
+    // eat the end characters
+    for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
+      endOff++;
+    }
+    if (start > 0 || end < len) {
+      if (start < end) {
+        t.setTermBuffer(t.termBuffer(), start, (end - start));
+      } else {
+        t.setTermLength(0);
+      }
+      if (updateOffsets) {
+        t.setStartOffset(t.startOffset() + start);
+        if (start < end) {
+          t.setEndOffset(t.endOffset() - endOff);
+        } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+      }
+      /*t = new Token( t.termText().substring( start, end ),
+              t.startOffset()+start,
+              t.endOffset()-endOff,
+              t.type() );*/
+    }

     return t;
   }
 }
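The trim now happens in place via setTermBuffer instead of allocating substrings or a replacement Token. A hypothetical trace (values made up for illustration):

// Illustrative trace: token " b " with offsets [0,3] and updateOffsets=true.
//
//   leading scan:   start = 1   (one leading space)
//   trailing scan:  end = 2, endOff = 1
//   setTermBuffer(buf, 1, 1)  -> term becomes "b"
//   setStartOffset(0 + 1) = 1
//   setEndOffset(3 - 1)   = 2
//
// No new Token and no substring() allocation, unlike the old code path.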

View File

@@ -192,7 +192,7 @@ final class WordDelimiterFilter extends TokenFilter {
   // use the type of the first char as the type
   // of the token.
   private int tokType(Token t) {
-    return charType(t.termText().charAt(0));
+    return charType(t.termBuffer()[0]);
   }

   // There isn't really an efficient queue class, so we will
@@ -207,23 +207,22 @@ final class WordDelimiterFilter extends TokenFilter {
   private Token newTok(Token orig, int start, int end) {
     int startOff = orig.startOffset();
     int endOff = orig.endOffset();
-    String origStr = orig.termText();
     // if length by start + end offsets doesn't match the term text then assume
     // this is a synonym and don't adjust the offsets.
-    if (origStr.length() == endOff-startOff) {
+    if (orig.termLength() == endOff-startOff) {
       endOff = startOff + end;
       startOff += start;
     }
-    return new Token(orig.termText().substring(start,end),
-            startOff,
+    Token newTok = new Token(startOff,
             endOff,
             orig.type());
+    newTok.setTermBuffer(orig.termBuffer(), start, (end - start));
+    return newTok;
   }

-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {

     // check the queue first
     if (queuePos<queue.size()) {
@@ -248,25 +247,25 @@ final class WordDelimiterFilter extends TokenFilter {
       Token t = input.next();
       if (t == null) return null;

-      String s = t.termText();
+      char [] termBuffer = t.termBuffer();
+      int len = t.termLength();
       int start=0;
-      int end=s.length();
-      if (end==0) continue;
+      if (len ==0) continue;

       origPosIncrement = t.getPositionIncrement();

       // Avoid calling charType more than once for each char (basically
       // avoid any backtracking).
       // makes code slightly more difficult, but faster.
-      int ch=s.charAt(start);
+      int ch=termBuffer[start];
       int type=charType(ch);

       int numWords=0;
-      while (start<end) {
+      while (start< len) {
         // first eat delimiters at the start of this subword
-        while ((type & SUBWORD_DELIM)!=0 && ++start<end) {
-          ch=s.charAt(start);
+        while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
+          ch=termBuffer[start];
           type=charType(ch);
         }
@@ -278,23 +277,23 @@ final class WordDelimiterFilter extends TokenFilter {
         int lastType=type;  // type of the previously read char

-        while (pos<end) {
+        while (pos< len) {
           if (type!=lastType) {
             // check and remove "'s" from the end of a token.
             // the pattern to check for is
             //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
             if ((lastType & ALPHA)!=0) {
-              if (ch=='\'' && pos+1<end
-                      && (s.charAt(pos+1)=='s' || s.charAt(pos+1)=='S'))
+              if (ch=='\'' && pos+1< len
+                      && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
               {
                 int subWordEnd=pos;
-                if (pos+2>=end) {
+                if (pos+2>= len) {
                   // end of string detected after "'s"
                   pos+=2;
                 } else {
                   // make sure that a delimiter follows "'s"
-                  int ch2 = s.charAt(pos+2);
+                  int ch2 = termBuffer[pos+2];
                   int type2 = charType(ch2);
                   if ((type2 & SUBWORD_DELIM)!=0) {
                     // if delimiter, move position pointer
@@ -340,7 +339,7 @@ final class WordDelimiterFilter extends TokenFilter {
           }
         }

-        if (++pos >= end) {
+        if (++pos >= len) {
           if (start==0) {
             // the subword is the whole original token, so
             // return it unchanged.
@@ -362,7 +361,7 @@ final class WordDelimiterFilter extends TokenFilter {
         }

         lastType = type;
-        ch = s.charAt(pos);
+        ch = termBuffer[pos];
         type = charType(ch);
       }

@@ -482,7 +481,7 @@ final class WordDelimiterFilter extends TokenFilter {
         tok = lst.get(i);
         if (catenateSubwords) {
           if (i==start) firstTok=tok;
-          sb.append(tok.termText());
+          sb.append(tok.termBuffer(), 0, tok.termLength());
         }
         if (generateSubwords) {
           queue.add(tok);
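For orientation, a rough sketch of the behavior this rewrite preserves, given settings like those in the HyphenatedWordsFilter example earlier (token text only; positions and offsets simplified):

// Illustrative behavior, not a trace from this commit:
//
//   input token:  "Wi-Fi"
//   generateWordParts=1  -> "Wi", "Fi"
//   catenateWords=1      -> "WiFi"   (stacked at the same position as a subword)
//
// All of the subword scanning above now indexes termBuffer[] directly
// instead of calling charAt() on a String copy of the term.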

View File

@@ -0,0 +1,35 @@
package org.apache.solr.util;

/**
 *
 *
 **/
//Since Arrays.equals doesn't implement offsets for equals
public class ArraysUtils {

  /**
   * See if two array slices are the same.
   *
   * @param left        The left array to compare
   * @param offsetLeft  The offset into the array.  Must be positive
   * @param right       The right array to compare
   * @param offsetRight the offset into the right array.  Must be positive
   * @param length      The length of the section of the array to compare
   * @return true if the two arrays, starting at their respective offsets, are equal
   *
   * @see java.util.Arrays#equals(char[], char[])
   */
  public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
    if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
      for (int i = 0; i < length; i++) {
        if (left[offsetLeft + i] != right[offsetRight + i]) {
          return false;
        }
      }
      return true;
    }
    return false;
  }
}

View File

@@ -0,0 +1,96 @@
package org.apache.solr.analysis;
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import net.sf.snowball.ext.EnglishStemmer;
import org.apache.solr.common.ResourceLoader;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.Collections;

public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {

  public void test() throws IOException {
    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    StringBuilder gold = new StringBuilder();
    for (int i = 0; i < test.length; i++) {
      stemmer.setCurrent(test[i]);
      stemmer.stem();
      gold.append(stemmer.getCurrent()).append(' ');
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    factory.init(args);
    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
    String out = tsToString(factory.create(new IterTokenStream(test)));
    assertEquals(gold.toString().trim(), out);
  }

  public void testProtected() throws Exception {
    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    StringBuilder gold = new StringBuilder();
    for (int i = 0; i < test.length; i++) {
      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
        stemmer.setCurrent(test[i]);
        stemmer.stem();
        gold.append(stemmer.getCurrent()).append(' ');
      } else {
        gold.append(test[i]).append(' ');
      }
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
    factory.init(args);
    List<String> lines = new ArrayList<String>();
    Collections.addAll(lines, "banks", "fledgling");
    factory.inform(new LinesMockSolrResourceLoader(lines));
    String out = tsToString(factory.create(new IterTokenStream(test)));
    assertEquals(gold.toString().trim(), out);
  }

  class LinesMockSolrResourceLoader implements ResourceLoader {
    List<String> lines;

    LinesMockSolrResourceLoader(List<String> lines) {
      this.lines = lines;
    }

    public List<String> getLines(String resource) throws IOException {
      return lines;
    }

    public Object newInstance(String cname, String... subpackages) {
      return null;
    }

    public InputStream openResource(String resource) throws IOException {
      return null;
    }
  }
}

View File

@@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class LengthFilterTest extends BaseTokenTestCase {

  public void test() throws IOException {
    LengthFilterFactory factory = new LengthFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
    args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
    factory.init(args);
    String[] test = {"foo", "foobar", "super-duper-trooper"};
    String gold = "foobar";
    String out = tsToString(factory.create(new IterTokenStream(test)));
    assertEquals(gold.toString(), out);
  }
}

View File

@@ -27,8 +27,8 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
  */
 public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
-    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
-    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
+    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
+    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
     // first test
     TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
     ts = new HyphenatedWordsFilter(ts);

View File

@@ -17,76 +17,96 @@
 package org.apache.solr.analysis;

-import java.io.StringReader;
-import java.util.regex.Pattern;
-
-import junit.framework.TestCase;
-
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

+import java.io.StringReader;
+import java.util.regex.Pattern;
+
 /**
  * @version $Id:$
  */
 public class TestPatternReplaceFilter extends AnalysisTestCase {

   public void testReplaceAll() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
             (new WhitespaceTokenizer(new StringReader(input)),
                     Pattern.compile("a*b"),
                     "-", true);
-    assertEquals("-foo-foo-foo-", ts.next().termText());
-    assertEquals("-", ts.next().termText());
-    assertEquals("c-", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }

   public void testReplaceFirst() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
             (new WhitespaceTokenizer(new StringReader(input)),
                     Pattern.compile("a*b"),
                     "-", false);
-    assertEquals("-fooaabfooabfoob", ts.next().termText());
-    assertEquals("-", ts.next().termText());
-    assertEquals("c-", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }

   public void testStripFirst() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
             (new WhitespaceTokenizer(new StringReader(input)),
                     Pattern.compile("a*b"),
                     null, false);
-    assertEquals("fooaabfooabfoob", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertEquals("c", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }

   public void testStripAll() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
             (new WhitespaceTokenizer(new StringReader(input)),
                     Pattern.compile("a*b"),
                     null, true);
-    assertEquals("foofoofoo", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertEquals("c", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }

   public void testReplaceAllWithBackRef() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
     TokenStream ts = new PatternReplaceFilter
             (new WhitespaceTokenizer(new StringReader(input)),
                     Pattern.compile("(a*)b"),
                     "$1\\$", true);
-    assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
-    assertEquals("a$", ts.next().termText());
-    assertEquals("caaaaaaaaa$", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);
   }

 }

View File

@@ -81,8 +81,8 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
         new IterTokenStream(stream.iterator()), enc, "text", inject );

     for( Token t : output ) {
-      Token got = filter.next();
-      assertEquals( t.termText(), got.termText());
+      Token got = filter.next(t);
+      assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
     }
     assertNull( filter.next() );  // no more tokens
   }

View File

@@ -35,11 +35,16 @@ public class TestTrimFilter extends BaseTokenTestCase {
                     new Token("cCc",11,15),
                     new Token(" ",16,20)), false );

-    assertEquals("a", ts.next().termText());
-    assertEquals("b", ts.next().termText());
-    assertEquals("cCc", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);

     ts = new TrimFilter( new IterTokenStream(
            new Token(" a", 0,2),

View File

@@ -0,0 +1,48 @@
package org.apache.solr.util;
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;

public class ArraysUtilsTest extends TestCase {

  public ArraysUtilsTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() {
    String left = "this is equal";
    String right = left;
    char[] leftChars = left.toCharArray();
    char[] rightChars = right.toCharArray();
    assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length()));

    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length()));
    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length()));

    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length()));
    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length()));
  }
}